1use crate::std_facade::{Box, Cow, String, ToOwned, Vec};
14use core::fmt;
15use core::mem;
16use core::ops::RangeInclusive;
17use core::u32;
18
19use regex_syntax::hir::{self, Hir, HirKind::*, Repetition};
20use regex_syntax::{Error as ParseError, ParserBuilder};
21
22use crate::bool;
23use crate::char;
24use crate::collection::{size_range, vec, SizeRange};
25use crate::strategy::*;
26use crate::test_runner::*;
27
28#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
31pub struct StringParam(&'static str);
32
33impl From<StringParam> for &'static str {
34 fn from(x: StringParam) -> Self {
35 x.0
36 }
37}
38
39impl From<&'static str> for StringParam {
40 fn from(x: &'static str) -> Self {
41 StringParam(x)
42 }
43}
44
45impl Default for StringParam {
46 fn default() -> Self {
47 StringParam("\\PC*")
48 }
49}
50
51#[derive(Debug)]
54pub enum Error {
55 RegexSyntax(ParseError),
57 UnsupportedRegex(&'static str),
60}
61
62impl fmt::Display for Error {
63 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
64 match self {
65 Error::RegexSyntax(err) => write!(f, "{}", err),
66 Error::UnsupportedRegex(message) => write!(f, "{}", message),
67 }
68 }
69}
70
71impl std::error::Error for Error {
72 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
73 match self {
74 Error::RegexSyntax(err) => Some(err),
75 Error::UnsupportedRegex(_) => None,
76 }
77 }
78}
79
80impl From<ParseError> for Error {
81 fn from(err: ParseError) -> Error {
82 Error::RegexSyntax(err)
83 }
84}
85
86opaque_strategy_wrapper! {
87 #[derive(Debug)]
92 pub struct RegexGeneratorStrategy[<T>][where T : fmt::Debug]
93 (SBoxedStrategy<T>) -> RegexGeneratorValueTree<T>;
94 pub struct RegexGeneratorValueTree[<T>][where T : fmt::Debug]
96 (Box<dyn ValueTree<Value = T>>) -> T;
97}
98
99impl Strategy for str {
100 type Tree = RegexGeneratorValueTree<String>;
101 type Value = String;
102
103 fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
104 string_regex(self).unwrap().new_tree(runner)
105 }
106}
107
108type ParseResult<T> = Result<RegexGeneratorStrategy<T>, Error>;
109
110#[doc(hidden)]
111pub trait StrategyFromRegex: Sized + fmt::Debug {
119 type Strategy: Strategy<Value = Self>;
120
121 fn from_regex(regex: &str) -> Self::Strategy;
123}
124
125impl StrategyFromRegex for String {
126 type Strategy = RegexGeneratorStrategy<Self>;
127
128 fn from_regex(regex: &str) -> Self::Strategy {
129 string_regex(regex).unwrap()
130 }
131}
132
133impl StrategyFromRegex for Vec<u8> {
134 type Strategy = RegexGeneratorStrategy<Self>;
135
136 fn from_regex(regex: &str) -> Self::Strategy {
137 bytes_regex(regex).unwrap()
138 }
139}
140
141pub fn string_regex(regex: &str) -> ParseResult<String> {
147 let hir = ParserBuilder::new().build().parse(regex)?;
148 string_regex_parsed(&hir)
149}
150
151pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
153 bytes_regex_parsed(expr)
154 .map(|v| {
155 v.prop_map(|bytes| {
156 String::from_utf8(bytes).expect("non-utf8 string")
157 })
158 .sboxed()
159 })
160 .map(RegexGeneratorStrategy)
161}
162
163pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
174 let hir = ParserBuilder::new()
175 .utf8(false)
176 .build()
177 .parse(regex)?;
178 bytes_regex_parsed(&hir)
179}
180
181pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> {
183 match expr.kind() {
184 Empty => Ok(Just(vec![]).sboxed()),
185
186 Literal(lit) => Ok(Just(lit.0.to_vec()).sboxed()),
187
188 Class(class) => Ok(match class {
189 hir::Class::Unicode(class) => {
190 unicode_class_strategy(class).prop_map(to_bytes).sboxed()
191 }
192 hir::Class::Bytes(class) => {
193 let subs = class.iter().map(|r| r.start()..=r.end());
194 Union::new(subs).prop_map(|b| vec![b]).sboxed()
195 }
196 }),
197
198 Repetition(rep) => {
199 Ok(vec(bytes_regex_parsed(&rep.sub)?, to_range(rep)?)
200 .prop_map(|parts| parts.concat())
201 .sboxed())
202 }
203
204 Capture(capture) => bytes_regex_parsed(&capture.sub).map(|v| v.0),
205
206 Concat(subs) => {
207 let subs = ConcatIter {
208 iter: subs.iter(),
209 buf: vec![],
210 next: None,
211 };
212 let ext = |(mut lhs, rhs): (Vec<_>, _)| {
213 lhs.extend(rhs);
214 lhs
215 };
216 Ok(subs
217 .fold(Ok(None), |accum: Result<_, Error>, rhs| {
218 Ok(match accum? {
219 None => Some(rhs?.sboxed()),
220 Some(accum) => {
221 Some((accum, rhs?).prop_map(ext).sboxed())
222 }
223 })
224 })?
225 .unwrap_or_else(|| Just(vec![]).sboxed()))
226 }
227
228 Alternation(subs) => {
229 Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed())
230 }
231
232 Look(_) => unsupported(
233 "anchors/boundaries not supported for string generation",
234 ),
235 }
236 .map(RegexGeneratorStrategy)
237}
238
239fn unicode_class_strategy(
240 class: &hir::ClassUnicode,
241) -> char::CharStrategy<'static> {
242 static NONL_RANGES: &[RangeInclusive<char>] = &[
243 '\x00'..='\x09',
244 '\x0B'..=::core::char::MAX,
248 '\x0B'..=::core::char::MAX,
249 '\x0B'..=::core::char::MAX,
250 '\x0B'..=::core::char::MAX,
251 '\x0B'..=::core::char::MAX,
252 ];
253
254 let dotnnl = |x: &hir::ClassUnicodeRange, y: &hir::ClassUnicodeRange| {
255 x.start() == '\0'
256 && x.end() == '\x09'
257 && y.start() == '\x0B'
258 && y.end() == '\u{10FFFF}'
259 };
260
261 char::ranges(match class.ranges() {
262 [x, y] if dotnnl(x, y) || dotnnl(y, x) => Cow::Borrowed(NONL_RANGES),
263 _ => Cow::Owned(class.iter().map(|r| r.start()..=r.end()).collect()),
264 })
265}
266
267struct ConcatIter<'a, I> {
268 buf: Vec<u8>,
269 iter: I,
270 next: Option<&'a Hir>,
271}
272
273fn flush_lit_buf<I>(
274 it: &mut ConcatIter<'_, I>,
275) -> Option<ParseResult<Vec<u8>>> {
276 Some(Ok(RegexGeneratorStrategy(
277 Just(mem::replace(&mut it.buf, vec![])).sboxed(),
278 )))
279}
280
281impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> {
282 type Item = ParseResult<Vec<u8>>;
283
284 fn next(&mut self) -> Option<Self::Item> {
285 if let Some(next) = self.next.take() {
287 return Some(bytes_regex_parsed(next));
288 }
289
290 while let Some(next) = self.iter.next() {
292 match next.kind() {
293 Literal(literal) => self.buf.extend_from_slice(&literal.0),
295 _ => {
297 return if !self.buf.is_empty() {
298 self.next = Some(next);
301 flush_lit_buf(self)
302 } else {
303 Some(bytes_regex_parsed(next))
305 };
306 }
307 }
308 }
309
310 if !self.buf.is_empty() {
312 flush_lit_buf(self)
313 } else {
314 self.next.take().map(bytes_regex_parsed)
315 }
316 }
317}
318
319fn to_range(rep: &Repetition) -> Result<SizeRange, Error> {
320 Ok(match (rep.min, rep.max) {
321 (0, Some(1)) => size_range(0..=1),
323 (0, None) => size_range(0..=32),
325 (1, None) => size_range(1..=32),
327 (u32::MAX, Some(u32::MAX)) => {
329 return unsupported("Cannot have repetition of exactly u32::MAX");
330 }
331 (min, Some(max)) if min == max => size_range(min as usize),
333 (min, None) => {
335 let max = if min < u32::MAX as u32 / 2 {
336 min as usize * 2
337 } else {
338 u32::MAX as usize
339 };
340 size_range((min as usize)..max)
341 }
342 (_, Some(u32::MAX)) => {
344 return unsupported("Cannot have repetition max of u32::MAX")
345 }
346 (min, Some(max)) => size_range((min as usize)..(max as usize + 1)),
348 })
349}
350
351fn to_bytes(khar: char) -> Vec<u8> {
352 let mut buf = [0u8; 4];
353 khar.encode_utf8(&mut buf).as_bytes().to_owned()
354}
355
356fn unsupported<T>(error: &'static str) -> Result<T, Error> {
357 Err(Error::UnsupportedRegex(error))
358}
359
360#[cfg(test)]
361mod test {
362 use std::collections::HashSet;
363
364 use regex::Regex;
365 use regex::bytes::Regex as BytesRegex;
366
367 use super::*;
368
369 fn printable_ascii(v: &[u8]) -> String {
370 v.iter()
371 .flat_map(|c| std::ascii::escape_default(*c))
372 .map(|c| char::from_u32(c.into()).unwrap())
373 .collect()
374 }
375
376 fn do_test(
377 pattern: &str,
378 min_distinct: usize,
379 max_distinct: usize,
380 iterations: usize,
381 ) {
382 let generated = generate_values_matching_regex(pattern, iterations);
383 assert!(
384 generated.len() >= min_distinct,
385 "Expected to generate at least {} strings, but only \
386 generated {}",
387 min_distinct,
388 generated.len()
389 );
390 assert!(
391 generated.len() <= max_distinct,
392 "Expected to generate at most {} strings, but \
393 generated {}",
394 max_distinct,
395 generated.len()
396 );
397 }
398
399 fn do_test_bytes(
400 pattern: &str,
401 min_distinct: usize,
402 max_distinct: usize,
403 iterations: usize,
404 ) {
405 let generated = generate_byte_values_matching_regex(pattern, iterations);
406 assert!(
407 generated.len() >= min_distinct,
408 "Expected to generate at least {} strings, but only \
409 generated {}",
410 min_distinct,
411 generated.len()
412 );
413 assert!(
414 generated.len() <= max_distinct,
415 "Expected to generate at most {} strings, but \
416 generated {}",
417 max_distinct,
418 generated.len()
419 );
420 }
421
422 fn generate_values_matching_regex(
423 pattern: &str,
424 iterations: usize,
425 ) -> HashSet<String> {
426 let rx = Regex::new(pattern).unwrap();
427 let mut generated = HashSet::new();
428
429 let strategy = string_regex(pattern).unwrap();
430 let mut runner = TestRunner::deterministic();
431 for _ in 0..iterations {
432 let mut value = strategy.new_tree(&mut runner).unwrap();
433
434 loop {
435 let s = value.current();
436 let ok = if let Some(matsch) = rx.find(&s) {
437 0 == matsch.start() && s.len() == matsch.end()
438 } else {
439 false
440 };
441 if !ok {
442 panic!(
443 "Generated string {:?} which does not match {:?}",
444 s, pattern
445 );
446 }
447
448 generated.insert(s);
449
450 if !value.simplify() {
451 break;
452 }
453 }
454 }
455 generated
456 }
457
458 fn generate_byte_values_matching_regex(
459 pattern: &str,
460 iterations: usize,
461 ) -> HashSet<Vec<u8>> {
462 let rx = BytesRegex::new(pattern).unwrap();
463 let mut generated = HashSet::new();
464
465 let strategy = bytes_regex(pattern).unwrap();
466 let mut runner = TestRunner::deterministic();
467 for _ in 0..iterations {
468 let mut value = strategy.new_tree(&mut runner).unwrap();
469
470 loop {
471 let s = value.current();
472 let ok = if let Some(matsch) = rx.find(&s) {
473 0 == matsch.start() && s.len() == matsch.end()
474 } else {
475 false
476 };
477 if !ok {
478 panic!(
479 "Generated string {:?} which does not match {:?}",
480 printable_ascii(&s), pattern
481 );
482 }
483
484 generated.insert(s);
485
486 if !value.simplify() {
487 break;
488 }
489 }
490 }
491 generated
492 }
493
494 #[test]
495 fn test_case_insensitive_produces_all_available_values() {
496 let mut expected: HashSet<String> = HashSet::new();
497 expected.insert("a".into());
498 expected.insert("b".into());
499 expected.insert("A".into());
500 expected.insert("B".into());
501 assert_eq!(generate_values_matching_regex("(?i:a|B)", 64), expected);
502 }
503
504 #[test]
505 fn test_literal() {
506 do_test("foo", 1, 1, 8);
507 do_test_bytes("foo", 1, 1, 8);
508 }
509
510 #[test]
511 fn test_casei_literal() {
512 do_test("(?i:fOo)", 8, 8, 64);
513 }
514
515 #[test]
516 fn test_alternation() {
517 do_test("foo|bar|baz", 3, 3, 16);
518 do_test_bytes("foo|bar|baz", 3, 3, 16);
519 }
520
521 #[test]
522 fn test_repetition() {
523 do_test("a{0,8}", 9, 9, 64);
524 do_test_bytes("a{0,8}", 9, 9, 64);
525 }
526
527 #[test]
528 fn test_question() {
529 do_test("a?", 2, 2, 16);
530 do_test_bytes("a?", 2, 2, 16);
531 }
532
533 #[test]
534 fn test_star() {
535 do_test("a*", 33, 33, 256);
536 do_test_bytes("a*", 33, 33, 256);
537 }
538
539 #[test]
540 fn test_plus() {
541 do_test("a+", 32, 32, 256);
542 do_test_bytes("a+", 32, 32, 256);
543 }
544
545 #[test]
546 fn test_n_to_range() {
547 do_test("a{4,}", 4, 4, 64);
548 do_test_bytes("a{4,}", 4, 4, 64);
549 }
550
551 #[test]
552 fn test_concatenation() {
553 do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
554 do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
555 }
556
557 #[test]
558 fn test_ascii_class() {
559 do_test("[[:digit:]]", 10, 10, 256);
560 }
561
562 #[test]
563 fn test_unicode_class() {
564 do_test("\\p{Greek}", 24, 512, 256);
565 }
566
567 #[test]
568 fn test_dot() {
569 do_test(".", 200, 65536, 256);
570 }
571
572 #[test]
573 fn test_dot_s() {
574 do_test("(?s).", 200, 65536, 256);
575 do_test_bytes("(?s-u).", 256, 256, 2048);
576 }
577
578 #[test]
579 fn test_backslash_d_plus() {
580 do_test("\\d+", 1, 65536, 256);
581 }
582
583 #[test]
584 fn test_non_utf8_byte_strings() {
585 do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
586 do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
587 do_test_bytes(r#"(?x-u)
588 \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
589 ( \xed ( \xb0\x80 | \xbf\xbf ) )? )
590 | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
591 }
592
593 fn assert_send_and_sync<T: Send + Sync>(_: T) {}
594
595 #[test]
596 fn regex_strategy_is_send_and_sync() {
597 assert_send_and_sync(string_regex(".").unwrap());
598 }
599
600 macro_rules! consistent {
601 ($name:ident, $value:expr) => {
602 #[test]
603 fn $name() {
604 test_generates_matching_strings($value);
605 }
606 };
607 }
608
609 fn test_generates_matching_strings(pattern: &str) {
610 use std::time;
611
612 let mut runner = TestRunner::default();
613 let start = time::Instant::now();
614
615 if let Ok(strategy) = string_regex(pattern) {
617 let rx = Regex::new(pattern).unwrap();
618
619 for _ in 0..1000 {
620 let mut val = strategy.new_tree(&mut runner).unwrap();
621 for _ in 0..1000 {
623 let s = val.current();
624 assert!(
625 rx.is_match(&s),
626 "Produced string {:?}, which does not match {:?}",
627 s,
628 pattern
629 );
630
631 if !val.simplify() {
632 break;
633 }
634 }
635
636 if start.elapsed().as_secs() > 10 {
638 break;
639 }
640 }
641 }
642 }
643
644 include!("regex-contrib/crates_regex.rs");
645}