proptest/arbitrary/_std/string.rs
1//-
2// Copyright 2017, 2018 The proptest developers
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Arbitrary implementations for `std::string`.
11
12use crate::std_facade::{Box, String, Vec};
13use std::iter;
14use std::rc::Rc;
15use std::slice;
16use std::sync::Arc;
17
18multiplex_alloc! {
19 alloc::string::FromUtf8Error, ::std::string::FromUtf8Error,
20 alloc::string::FromUtf16Error, ::std::string::FromUtf16Error
21}
22
23use crate::arbitrary::*;
24use crate::collection;
25use crate::strategy::statics::static_map;
26use crate::strategy::*;
27use crate::string::StringParam;
28
29impl Arbitrary for String {
30 type Parameters = StringParam;
31 type Strategy = &'static str;
32
33 /// ## Panics
34 ///
35 /// This implementation panics if the input is not a valid regex proptest
36 /// can handle.
37 fn arbitrary_with(args: Self::Parameters) -> Self::Strategy {
38 args.into()
39 }
40}
41
42macro_rules! dst_wrapped {
43 ($($w: ident),*) => {
44 $(arbitrary!($w<str>, MapInto<StrategyFor<String>, Self>, StringParam;
45 a => any_with::<String>(a).prop_map_into()
46 );)*
47 };
48}
49
50dst_wrapped!(Box, Rc, Arc);
51
52lazy_just!(FromUtf16Error, || String::from_utf16(&[0xD800])
53 .unwrap_err());
54
55// This is a void-like type, it needs to be handled by the user of
56// the type by simply never constructing the variant in an enum or for
57// structs by inductively not generating the struct.
58// The same applies to ! and Infallible.
59// generator!(ParseError, || panic!());
60
61arbitrary!(FromUtf8Error, SFnPtrMap<BoxedStrategy<Vec<u8>>, Self>;
62 static_map(not_utf8_bytes(true).boxed(),
63 |bs| String::from_utf8(bs).unwrap_err())
64);
65
66/// This strategy produces sequences of bytes that are guaranteed to be illegal
67/// wrt. UTF-8 with the goal of producing a suffix of bytes in the end of
68/// an otherwise legal UTF-8 string that causes the string to be illegal.
69/// This is used primarily to generate the `Utf8Error` type and similar.
70pub(crate) fn not_utf8_bytes(
71 allow_null: bool,
72) -> impl Strategy<Value = Vec<u8>> {
73 let prefix = collection::vec(any::<char>(), ..::std::u16::MAX as usize);
74 let suffix = gen_el_bytes(allow_null);
75 (prefix, suffix).prop_map(move |(prefix_bytes, el_bytes)| {
76 let iter = prefix_bytes.iter();
77 let string: String = if allow_null {
78 iter.collect()
79 } else {
80 iter.filter(|&&x| x != '\u{0}').collect()
81 };
82 let mut bytes = string.into_bytes();
83 bytes.extend(el_bytes.into_iter());
84 bytes
85 })
86}
87
88/// Stands for "error_length" bytes and contains a suffix of bytes that
89/// will cause the whole string to become invalid UTF-8.
90/// See `gen_el_bytes` for more details.
91#[derive(Debug)]
92enum ELBytes {
93 B1([u8; 1]),
94 B2([u8; 2]),
95 B3([u8; 3]),
96 B4([u8; 4]),
97}
98
99impl<'a> IntoIterator for &'a ELBytes {
100 type Item = u8;
101 type IntoIter = iter::Cloned<slice::Iter<'a, u8>>;
102 fn into_iter(self) -> Self::IntoIter {
103 use self::ELBytes::*;
104 (match *self {
105 B1(ref a) => a.iter(),
106 B2(ref a) => a.iter(),
107 B3(ref a) => a.iter(),
108 B4(ref a) => a.iter(),
109 })
110 .cloned()
111 }
112}
113
114// By analysis of run_utf8_validation defined at:
115// https://doc.rust-lang.org/nightly/src/core/str/mod.rs.html#1429
116// we know that .error_len() \in {None, Some(1), Some(2), Some(3)}.
117// We represent this with the range [0..4) and generate a valid
118// sequence from that.
119fn gen_el_bytes(allow_null: bool) -> impl Strategy<Value = ELBytes> {
120 fn b1(a: u8) -> ELBytes {
121 ELBytes::B1([a])
122 }
123 fn b2(a: (u8, u8)) -> ELBytes {
124 ELBytes::B2([a.0, a.1])
125 }
126 fn b3(a: ((u8, u8), u8)) -> ELBytes {
127 ELBytes::B3([(a.0).0, (a.0).1, a.1])
128 }
129 fn b4(a: ((u8, u8), u8, u8)) -> ELBytes {
130 ELBytes::B4([(a.0).0, (a.0).1, a.1, a.2])
131 }
132
133 /*
134 // https://tools.ietf.org/html/rfc3629
135 static UTF8_CHAR_WIDTH: [u8; 256] = [
136 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
138 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
140 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
146 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
147 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
148 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
149 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
150 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
151 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
152 ];
153
154 /// Mask of the value bits of a continuation byte.
155 const CONT_MASK: u8 = 0b0011_1111;
156 /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
157 const TAG_CONT_U8: u8 = 0b1000_0000;
158 */
159
160 // Continuation byte:
161 let succ_byte = 0x80u8..0xC0u8;
162
163 // Do we allow the nul byte or not?
164 let start_byte = if allow_null { 0x00u8 } else { 0x01u8 };
165
166 // Invalid continuation byte:
167 let fail_byte = prop_oneof![start_byte..0x7Fu8, 0xC1u8..];
168
169 // Matches zero in the UTF8_CHAR_WIDTH table above.
170 let byte0_w0 = prop_oneof![0x80u8..0xC0u8, 0xF5u8..];
171
172 // Start of a 3 (width) byte sequence:
173 // Leads here: https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1479
174 let byte0_w2 = 0xC2u8..0xE0u8;
175
176 // Start of a 3 (width) byte sequence:
177 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1484
178 // See the left column in the match.
179 let byte0_w3 = 0xE0u8..0xF0u8;
180
181 // Start of a 4 (width) byte sequence:
182 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1495
183 // See the left column in the match.
184 let byte0_w4 = 0xF0u8..0xF5u8;
185
186 // The 2 first (valid) bytes of a 3 (width) byte sequence:
187 // The first byte is byte0_w3. The second is the ones produced on the right.
188 let byte01_w3 = byte0_w3.clone().prop_flat_map(|x| {
189 (
190 Just(x),
191 match x {
192 0xE0u8 => 0xA0u8..0xC0u8,
193 0xE1u8..=0xECu8 => 0x80u8..0xC0u8,
194 0xEDu8 => 0x80u8..0xA0u8,
195 0xEEu8..=0xEFu8 => 0x80u8..0xA0u8,
196 _ => panic!(),
197 },
198 )
199 });
200
201 // In a 3 (width) byte sequence, an invalid second byte is chosen such that
202 // it will yield an error length of Some(1). The second byte is on
203 // the right of the match arms.
204 let byte01_w3_e1 = byte0_w3.clone().prop_flat_map(move |x| {
205 (
206 Just(x),
207 match x {
208 0xE0u8 => prop_oneof![start_byte..0xA0u8, 0xC0u8..],
209 0xE1u8..=0xECu8 => prop_oneof![start_byte..0x80u8, 0xC0u8..],
210 0xEDu8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
211 0xEEu8..=0xEFu8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
212 _ => panic!(),
213 },
214 )
215 });
216
217 // In a 4 (width) byte sequence, an invalid second byte is chosen such that
218 // it will yield an error length of Some(1). The second byte is on
219 // the right of the match arms.
220 let byte01_w4_e1 = byte0_w4.clone().prop_flat_map(move |x| {
221 (
222 Just(x),
223 match x {
224 0xF0u8 => prop_oneof![start_byte..0x90u8, 0xA0u8..],
225 0xF1u8..=0xF3u8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
226 0xF4u8 => prop_oneof![start_byte..0x80u8, 0x90u8..],
227 _ => panic!(),
228 },
229 )
230 });
231
232 // The 2 first (valid) bytes of a 4 (width) byte sequence:
233 // The first byte is byte0_w4. The second is the ones produced on the right.
234 let byte01_w4 = byte0_w4.clone().prop_flat_map(|x| {
235 (
236 Just(x),
237 match x {
238 0xF0u8 => 0x90u8..0xA0u8,
239 0xF1u8..=0xF3u8 => 0x80u8..0xA0u8,
240 0xF4u8 => 0x80u8..0x90u8,
241 _ => panic!(),
242 },
243 )
244 });
245
246 prop_oneof![
247 // error_len = None
248 // These are all happen when next!() fails to provide a byte.
249 prop_oneof![
250 // width = 2
251 // lacking 1 bytes:
252 static_map(byte0_w2.clone(), b1),
253 // width = 3
254 // lacking 2 bytes:
255 static_map(byte0_w3, b1),
256 // lacking 1 bytes:
257 static_map(byte01_w3.clone(), b2),
258 // width = 4
259 // lacking 3 bytes:
260 static_map(byte0_w4, b1),
261 // lacking 2 bytes:
262 static_map(byte01_w4.clone(), b2),
263 // lacking 1 byte:
264 static_map((byte01_w4.clone(), succ_byte.clone()), b3),
265 ],
266 // error_len = Some(1)
267 prop_oneof![
268 // width = 1 is not represented.
269 // width = 0
270 // path taken:
271 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1508
272 static_map(byte0_w0, b1),
273 // width = 2
274 // path taken:
275 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1480
276 static_map((byte0_w2, fail_byte.clone()), b2),
277 // width = 3
278 // path taken:
279 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1488
280 static_map(byte01_w3_e1, b2),
281 // width = 4
282 // path taken:
283 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1499
284 static_map(byte01_w4_e1, b2),
285 ],
286 // error_len = Some(2)
287 static_map(
288 prop_oneof![
289 // width = 3
290 // path taken:
291 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1491
292 (byte01_w3, fail_byte.clone()),
293 // width = 4
294 // path taken:
295 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1502
296 (byte01_w4.clone(), fail_byte.clone())
297 ],
298 b3
299 ),
300 // error_len = Some(3), width = 4
301 // path taken:
302 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1505
303 static_map((byte01_w4, succ_byte, fail_byte), b4),
304 ]
305 .boxed()
306}
307
308#[cfg(test)]
309mod test {
310 no_panic_test!(
311 string => String,
312 str_box => Box<str>,
313 str_rc => Rc<str>,
314 str_arc => Arc<str>,
315 from_utf16_error => FromUtf16Error,
316 from_utf8_error => FromUtf8Error
317 );
318}