proptest/
string.rs

1//-
2// Copyright 2017 Jason Lingle
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Strategies for generating strings and byte strings from regular
11//! expressions.
12
13use crate::std_facade::{Box, Cow, String, ToOwned, Vec};
14use core::fmt;
15use core::mem;
16use core::ops::RangeInclusive;
17use core::u32;
18
19use regex_syntax::hir::{self, Hir, HirKind::*, Repetition};
20use regex_syntax::{Error as ParseError, ParserBuilder};
21
22use crate::bool;
23use crate::char;
24use crate::collection::{size_range, vec, SizeRange};
25use crate::strategy::*;
26use crate::test_runner::*;
27
28/// Wraps the regex that forms the `Strategy` for `String` so that a sensible
29/// `Default` can be given. The default is a string of non-control characters.
30#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
31pub struct StringParam(&'static str);
32
33impl From<StringParam> for &'static str {
34    fn from(x: StringParam) -> Self {
35        x.0
36    }
37}
38
39impl From<&'static str> for StringParam {
40    fn from(x: &'static str) -> Self {
41        StringParam(x)
42    }
43}
44
45impl Default for StringParam {
46    fn default() -> Self {
47        StringParam("\\PC*")
48    }
49}
50
51/// Errors which may occur when preparing a regular expression for use with
52/// string generation.
53#[derive(Debug)]
54pub enum Error {
55    /// The string passed as the regex was not syntactically valid.
56    RegexSyntax(ParseError),
57    /// The regex was syntactically valid, but contains elements not
58    /// supported by proptest.
59    UnsupportedRegex(&'static str),
60}
61
62impl fmt::Display for Error {
63    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
64        match self {
65            Error::RegexSyntax(err) => write!(f, "{}", err),
66            Error::UnsupportedRegex(message) => write!(f, "{}", message),
67        }
68    }
69}
70
71impl std::error::Error for Error {
72    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
73        match self {
74            Error::RegexSyntax(err) => Some(err),
75            Error::UnsupportedRegex(_) => None,
76        }
77    }
78}
79
80impl From<ParseError> for Error {
81    fn from(err: ParseError) -> Error {
82        Error::RegexSyntax(err)
83    }
84}
85
86opaque_strategy_wrapper! {
87    /// Strategy which generates values (i.e., `String` or `Vec<u8>`) matching
88    /// a regular expression.
89    ///
90    /// Created by various functions in this module.
91    #[derive(Debug)]
92    pub struct RegexGeneratorStrategy[<T>][where T : fmt::Debug]
93        (SBoxedStrategy<T>) -> RegexGeneratorValueTree<T>;
94    /// `ValueTree` corresponding to `RegexGeneratorStrategy`.
95    pub struct RegexGeneratorValueTree[<T>][where T : fmt::Debug]
96        (Box<dyn ValueTree<Value = T>>) -> T;
97}
98
99impl Strategy for str {
100    type Tree = RegexGeneratorValueTree<String>;
101    type Value = String;
102
103    fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
104        string_regex(self).unwrap().new_tree(runner)
105    }
106}
107
108type ParseResult<T> = Result<RegexGeneratorStrategy<T>, Error>;
109
110#[doc(hidden)]
111/// A type which knows how to produce a `Strategy` from a regular expression
112/// generating the type.
113///
114/// This trait exists for the benefit of `#[proptest(regex = "...")]`.
115/// It is semver exempt, so use at your own risk.
116/// If you found a use for the trait beyond `Vec<u8>` and `String`,
117/// please file an issue at https://github.com/proptest-rs/proptest.
118pub trait StrategyFromRegex: Sized + fmt::Debug {
119    type Strategy: Strategy<Value = Self>;
120
121    /// Produce a strategy for `Self` from the `regex`.
122    fn from_regex(regex: &str) -> Self::Strategy;
123}
124
125impl StrategyFromRegex for String {
126    type Strategy = RegexGeneratorStrategy<Self>;
127
128    fn from_regex(regex: &str) -> Self::Strategy {
129        string_regex(regex).unwrap()
130    }
131}
132
133impl StrategyFromRegex for Vec<u8> {
134    type Strategy = RegexGeneratorStrategy<Self>;
135
136    fn from_regex(regex: &str) -> Self::Strategy {
137        bytes_regex(regex).unwrap()
138    }
139}
140
141/// Creates a strategy which generates strings matching the given regular
142/// expression.
143///
144/// If you don't need error handling and aren't limited by setup time, it is
145/// also possible to directly use a `&str` as a strategy with the same effect.
146pub fn string_regex(regex: &str) -> ParseResult<String> {
147    let hir = ParserBuilder::new().build().parse(regex)?;
148    string_regex_parsed(&hir)
149}
150
151/// Like `string_regex()`, but allows providing a pre-parsed expression.
152pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
153    bytes_regex_parsed(expr)
154        .map(|v| {
155            v.prop_map(|bytes| {
156                String::from_utf8(bytes).expect("non-utf8 string")
157            })
158            .sboxed()
159        })
160        .map(RegexGeneratorStrategy)
161}
162
163/// Creates a strategy which generates byte strings matching the given regular
164/// expression.
165///
166/// By default, the byte strings generated by this strategy _will_ be valid
167/// UTF-8.  If you wish to generate byte strings that aren't (necessarily)
168/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`.
169/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.`
170/// will generate newline characters (byte value `0x0A`).  See the
171/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support)
172/// for more information.
173pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
174    let hir = ParserBuilder::new()
175        .utf8(false)
176        .build()
177        .parse(regex)?;
178    bytes_regex_parsed(&hir)
179}
180
181/// Like `bytes_regex()`, but allows providing a pre-parsed expression.
182pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> {
183    match expr.kind() {
184        Empty => Ok(Just(vec![]).sboxed()),
185
186        Literal(lit) => Ok(Just(lit.0.to_vec()).sboxed()),
187
188        Class(class) => Ok(match class {
189            hir::Class::Unicode(class) => {
190                unicode_class_strategy(class).prop_map(to_bytes).sboxed()
191            }
192            hir::Class::Bytes(class) => {
193                let subs = class.iter().map(|r| r.start()..=r.end());
194                Union::new(subs).prop_map(|b| vec![b]).sboxed()
195            }
196        }),
197
198        Repetition(rep) => {
199            Ok(vec(bytes_regex_parsed(&rep.sub)?, to_range(rep)?)
200                .prop_map(|parts| parts.concat())
201                .sboxed())
202        }
203
204        Capture(capture) => bytes_regex_parsed(&capture.sub).map(|v| v.0),
205
206        Concat(subs) => {
207            let subs = ConcatIter {
208                iter: subs.iter(),
209                buf: vec![],
210                next: None,
211            };
212            let ext = |(mut lhs, rhs): (Vec<_>, _)| {
213                lhs.extend(rhs);
214                lhs
215            };
216            Ok(subs
217                .fold(Ok(None), |accum: Result<_, Error>, rhs| {
218                    Ok(match accum? {
219                        None => Some(rhs?.sboxed()),
220                        Some(accum) => {
221                            Some((accum, rhs?).prop_map(ext).sboxed())
222                        }
223                    })
224                })?
225                .unwrap_or_else(|| Just(vec![]).sboxed()))
226        }
227
228        Alternation(subs) => {
229            Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed())
230        }
231
232        Look(_) => unsupported(
233            "anchors/boundaries not supported for string generation",
234        ),
235    }
236    .map(RegexGeneratorStrategy)
237}
238
239fn unicode_class_strategy(
240    class: &hir::ClassUnicode,
241) -> char::CharStrategy<'static> {
242    static NONL_RANGES: &[RangeInclusive<char>] = &[
243        '\x00'..='\x09',
244        // Multiple instances of the latter range to partially make up
245        // for the bias of having such a tiny range in the control
246        // characters.
247        '\x0B'..=::core::char::MAX,
248        '\x0B'..=::core::char::MAX,
249        '\x0B'..=::core::char::MAX,
250        '\x0B'..=::core::char::MAX,
251        '\x0B'..=::core::char::MAX,
252    ];
253
254    let dotnnl = |x: &hir::ClassUnicodeRange, y: &hir::ClassUnicodeRange| {
255        x.start() == '\0'
256            && x.end() == '\x09'
257            && y.start() == '\x0B'
258            && y.end() == '\u{10FFFF}'
259    };
260
261    char::ranges(match class.ranges() {
262        [x, y] if dotnnl(x, y) || dotnnl(y, x) => Cow::Borrowed(NONL_RANGES),
263        _ => Cow::Owned(class.iter().map(|r| r.start()..=r.end()).collect()),
264    })
265}
266
267struct ConcatIter<'a, I> {
268    buf: Vec<u8>,
269    iter: I,
270    next: Option<&'a Hir>,
271}
272
273fn flush_lit_buf<I>(
274    it: &mut ConcatIter<'_, I>,
275) -> Option<ParseResult<Vec<u8>>> {
276    Some(Ok(RegexGeneratorStrategy(
277        Just(mem::replace(&mut it.buf, vec![])).sboxed(),
278    )))
279}
280
281impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> {
282    type Item = ParseResult<Vec<u8>>;
283
284    fn next(&mut self) -> Option<Self::Item> {
285        // A left-over node, process it first:
286        if let Some(next) = self.next.take() {
287            return Some(bytes_regex_parsed(next));
288        }
289
290        // Accumulate a literal sequence as long as we can:
291        while let Some(next) = self.iter.next() {
292            match next.kind() {
293                // A literal. Accumulate:
294                Literal(literal) => self.buf.extend_from_slice(&literal.0),
295                // Encountered a non-literal.
296                _ => {
297                    return if !self.buf.is_empty() {
298                        // We've accumulated a literal from before, flush it out.
299                        // Store this node so we deal with it the next call.
300                        self.next = Some(next);
301                        flush_lit_buf(self)
302                    } else {
303                        // We didn't; just yield this node.
304                        Some(bytes_regex_parsed(next))
305                    };
306                }
307            }
308        }
309
310        // Flush out any accumulated literal from before.
311        if !self.buf.is_empty() {
312            flush_lit_buf(self)
313        } else {
314            self.next.take().map(bytes_regex_parsed)
315        }
316    }
317}
318
319fn to_range(rep: &Repetition) -> Result<SizeRange, Error> {
320    Ok(match (rep.min, rep.max) {
321        // Zero or one
322        (0, Some(1)) => size_range(0..=1),
323        // Zero or more
324        (0, None) => size_range(0..=32),
325        // One or more
326        (1, None) => size_range(1..=32),
327        // Exact count of u32::MAX
328        (u32::MAX, Some(u32::MAX)) => {
329            return unsupported("Cannot have repetition of exactly u32::MAX");
330        }
331        // Exact count
332        (min, Some(max)) if min == max => size_range(min as usize),
333        // At least min
334        (min, None) => {
335            let max = if min < u32::MAX as u32 / 2 {
336                min as usize * 2
337            } else {
338                u32::MAX as usize
339            };
340            size_range((min as usize)..max)
341        }
342        // Bounded range with max of u32::MAX
343        (_, Some(u32::MAX)) => {
344            return unsupported("Cannot have repetition max of u32::MAX")
345        }
346        // Bounded range
347        (min, Some(max)) => size_range((min as usize)..(max as usize + 1)),
348    })
349}
350
351fn to_bytes(khar: char) -> Vec<u8> {
352    let mut buf = [0u8; 4];
353    khar.encode_utf8(&mut buf).as_bytes().to_owned()
354}
355
356fn unsupported<T>(error: &'static str) -> Result<T, Error> {
357    Err(Error::UnsupportedRegex(error))
358}
359
360#[cfg(test)]
361mod test {
362    use std::collections::HashSet;
363
364    use regex::Regex;
365    use regex::bytes::Regex as BytesRegex;
366
367    use super::*;
368
369    fn printable_ascii(v: &[u8]) -> String {
370        v.iter()
371            .flat_map(|c| std::ascii::escape_default(*c))
372            .map(|c| char::from_u32(c.into()).unwrap())
373            .collect()
374    }
375
376    fn do_test(
377        pattern: &str,
378        min_distinct: usize,
379        max_distinct: usize,
380        iterations: usize,
381    ) {
382        let generated = generate_values_matching_regex(pattern, iterations);
383        assert!(
384            generated.len() >= min_distinct,
385            "Expected to generate at least {} strings, but only \
386             generated {}",
387            min_distinct,
388            generated.len()
389        );
390        assert!(
391            generated.len() <= max_distinct,
392            "Expected to generate at most {} strings, but \
393             generated {}",
394            max_distinct,
395            generated.len()
396        );
397    }
398
399    fn do_test_bytes(
400        pattern: &str,
401        min_distinct: usize,
402        max_distinct: usize,
403        iterations: usize,
404    ) {
405        let generated = generate_byte_values_matching_regex(pattern, iterations);
406        assert!(
407            generated.len() >= min_distinct,
408            "Expected to generate at least {} strings, but only \
409             generated {}",
410            min_distinct,
411            generated.len()
412        );
413        assert!(
414            generated.len() <= max_distinct,
415            "Expected to generate at most {} strings, but \
416             generated {}",
417            max_distinct,
418            generated.len()
419        );
420    }
421
422    fn generate_values_matching_regex(
423        pattern: &str,
424        iterations: usize,
425    ) -> HashSet<String> {
426        let rx = Regex::new(pattern).unwrap();
427        let mut generated = HashSet::new();
428
429        let strategy = string_regex(pattern).unwrap();
430        let mut runner = TestRunner::deterministic();
431        for _ in 0..iterations {
432            let mut value = strategy.new_tree(&mut runner).unwrap();
433
434            loop {
435                let s = value.current();
436                let ok = if let Some(matsch) = rx.find(&s) {
437                    0 == matsch.start() && s.len() == matsch.end()
438                } else {
439                    false
440                };
441                if !ok {
442                    panic!(
443                        "Generated string {:?} which does not match {:?}",
444                        s, pattern
445                    );
446                }
447
448                generated.insert(s);
449
450                if !value.simplify() {
451                    break;
452                }
453            }
454        }
455        generated
456    }
457
458    fn generate_byte_values_matching_regex(
459        pattern: &str,
460        iterations: usize,
461    ) -> HashSet<Vec<u8>> {
462        let rx = BytesRegex::new(pattern).unwrap();
463        let mut generated = HashSet::new();
464
465        let strategy = bytes_regex(pattern).unwrap();
466        let mut runner = TestRunner::deterministic();
467        for _ in 0..iterations {
468            let mut value = strategy.new_tree(&mut runner).unwrap();
469
470            loop {
471                let s = value.current();
472                let ok = if let Some(matsch) = rx.find(&s) {
473                    0 == matsch.start() && s.len() == matsch.end()
474                } else {
475                    false
476                };
477                if !ok {
478                    panic!(
479                        "Generated string {:?} which does not match {:?}",
480                        printable_ascii(&s), pattern
481                    );
482                }
483
484                generated.insert(s);
485
486                if !value.simplify() {
487                    break;
488                }
489            }
490        }
491        generated
492    }
493
494    #[test]
495    fn test_case_insensitive_produces_all_available_values() {
496        let mut expected: HashSet<String> = HashSet::new();
497        expected.insert("a".into());
498        expected.insert("b".into());
499        expected.insert("A".into());
500        expected.insert("B".into());
501        assert_eq!(generate_values_matching_regex("(?i:a|B)", 64), expected);
502    }
503
504    #[test]
505    fn test_literal() {
506        do_test("foo", 1, 1, 8);
507        do_test_bytes("foo", 1, 1, 8);
508    }
509
510    #[test]
511    fn test_casei_literal() {
512        do_test("(?i:fOo)", 8, 8, 64);
513    }
514
515    #[test]
516    fn test_alternation() {
517        do_test("foo|bar|baz", 3, 3, 16);
518        do_test_bytes("foo|bar|baz", 3, 3, 16);
519    }
520
521    #[test]
522    fn test_repetition() {
523        do_test("a{0,8}", 9, 9, 64);
524        do_test_bytes("a{0,8}", 9, 9, 64);
525    }
526
527    #[test]
528    fn test_question() {
529        do_test("a?", 2, 2, 16);
530        do_test_bytes("a?", 2, 2, 16);
531    }
532
533    #[test]
534    fn test_star() {
535        do_test("a*", 33, 33, 256);
536        do_test_bytes("a*", 33, 33, 256);
537    }
538
539    #[test]
540    fn test_plus() {
541        do_test("a+", 32, 32, 256);
542        do_test_bytes("a+", 32, 32, 256);
543    }
544
545    #[test]
546    fn test_n_to_range() {
547        do_test("a{4,}", 4, 4, 64);
548        do_test_bytes("a{4,}", 4, 4, 64);
549    }
550
551    #[test]
552    fn test_concatenation() {
553        do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
554        do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
555    }
556
557    #[test]
558    fn test_ascii_class() {
559        do_test("[[:digit:]]", 10, 10, 256);
560    }
561
562    #[test]
563    fn test_unicode_class() {
564        do_test("\\p{Greek}", 24, 512, 256);
565    }
566
567    #[test]
568    fn test_dot() {
569        do_test(".", 200, 65536, 256);
570    }
571
572    #[test]
573    fn test_dot_s() {
574        do_test("(?s).", 200, 65536, 256);
575        do_test_bytes("(?s-u).", 256, 256, 2048);
576    }
577
578    #[test]
579    fn test_backslash_d_plus() {
580        do_test("\\d+", 1, 65536, 256);
581    }
582
583    #[test]
584    fn test_non_utf8_byte_strings() {
585        do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
586        do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
587        do_test_bytes(r#"(?x-u)
588  \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
589          ( \xed ( \xb0\x80 | \xbf\xbf ) )? )
590        | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
591    }
592
593    fn assert_send_and_sync<T: Send + Sync>(_: T) {}
594
595    #[test]
596    fn regex_strategy_is_send_and_sync() {
597        assert_send_and_sync(string_regex(".").unwrap());
598    }
599
600    macro_rules! consistent {
601        ($name:ident, $value:expr) => {
602            #[test]
603            fn $name() {
604                test_generates_matching_strings($value);
605            }
606        };
607    }
608
609    fn test_generates_matching_strings(pattern: &str) {
610        use std::time;
611
612        let mut runner = TestRunner::default();
613        let start = time::Instant::now();
614
615        // If we don't support this regex, just move on quietly
616        if let Ok(strategy) = string_regex(pattern) {
617            let rx = Regex::new(pattern).unwrap();
618
619            for _ in 0..1000 {
620                let mut val = strategy.new_tree(&mut runner).unwrap();
621                // No more than 1000 simplify steps to keep test time down
622                for _ in 0..1000 {
623                    let s = val.current();
624                    assert!(
625                        rx.is_match(&s),
626                        "Produced string {:?}, which does not match {:?}",
627                        s,
628                        pattern
629                    );
630
631                    if !val.simplify() {
632                        break;
633                    }
634                }
635
636                // Quietly stop testing if we've run for >10 s
637                if start.elapsed().as_secs() > 10 {
638                    break;
639                }
640            }
641        }
642    }
643
644    include!("regex-contrib/crates_regex.rs");
645}