proptest/
char.rs

1//-
2// Copyright 2017 Jason Lingle
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Strategies for generating `char` values.
11//!
12//! Unlike most strategies in Proptest, character generation is by default
13//! biased to particular values known to be difficult to handle in various
14//! circumstances.
15//!
16//! The main things of interest are `any()` to generate truly arbitrary
17//! characters, and `range()` and `ranges()` to select characters from
18//! inclusive ranges.
19
20use crate::std_facade::Cow;
21use core::ops::RangeInclusive;
22
23use rand::Rng;
24
25use crate::num;
26use crate::strategy::*;
27use crate::test_runner::*;
28
29/// An inclusive char range from fst to snd.
30type CharRange = RangeInclusive<char>;
31
32/// A default set of characters to consider as "special" during character
33/// generation.
34///
35/// Most of the characters here were chosen specifically because they are
36/// difficult to handle in particular contexts.
37pub const DEFAULT_SPECIAL_CHARS: &[char] = &[
38    // Things to give shell scripts and filesystem logic difficulties
39    '/', '\\', '$', '.', '*', '{', '\'', '"', '`', ':',
40    // Characters with special significance in URLs and elsewhere
41    '?', '%', '=', '&', '<',
42    // Interesting ASCII control characters
43    // NUL, HT,   CR,   LF,   VT      ESC     DEL
44    '\x00', '\t', '\r', '\n', '\x0B', '\x1B', '\x7F',
45    // ¥ both to test simple Unicode handling and because it has interesting
46    // properties on MS Shift-JIS systems.
47    '¥', // No non-Unicode encoding has both ¥ and Ѩ
48    'Ѩ',
49    // In UTF-8, Ⱥ increases in length from 2 to 3 bytes when lowercased
50    'Ⱥ',
51    // More Unicode edge-cases: BOM, replacement character, RTL override, and non-BMP
52    '\u{FEFF}', '\u{FFFD}', '\u{202E}', '🕴',
53];
54
55/// A default sequence of ranges used preferentially when generating random
56/// characters.
57pub const DEFAULT_PREFERRED_RANGES: &[CharRange] = &[
58    // ASCII printable
59    ' '..='~',
60    ' '..='~',
61    ' '..='~',
62    ' '..='~',
63    ' '..='~',
64    // Latin-1
65    '\u{0040}'..='\u{00ff}',
66];
67
68/// Selects a random character the way `CharStrategy` does.
69///
70/// If `special` is non-empty, there is a 50% chance that a character from this
71/// array is chosen randomly, and will be returned if that character falls
72/// within `ranges`.
73///
74/// If `preferred` is non-empty, there is a 50% chance that any generation
75/// which gets past the `special` step picks a random element from this list,
76/// then a random character from within that range (both endpoints inclusive).
77/// That character will be returned if it falls within `ranges`.
78///
79/// In all other cases, an element is picked randomly from `ranges` and a
80/// random character within the range (both endpoints inclusive) is chosen and
81/// returned.
82///
83/// Notice that in all cases, `ranges` completely defines the set of characters
84/// that can possibly be defined.
85///
86/// It is legal for ranges in all cases to contain non-characters.
87///
88/// Both `preferred` and `ranges` bias selection towards characters in smaller
89/// ranges. This is deliberate. `preferred` is usually tuned to select
90/// particular characters anyway. `ranges` is usually derived from some
91/// external property, and the fact that a range is small often means it is
92/// more interesting.
93pub fn select_char(
94    rnd: &mut impl Rng,
95    special: &[char],
96    preferred: &[CharRange],
97    ranges: &[CharRange],
98) -> char {
99    let (base, offset) = select_range_index(rnd, special, preferred, ranges);
100    ::core::char::from_u32(base + offset).expect("bad character selected")
101}
102
103fn select_range_index(
104    rnd: &mut impl Rng,
105    special: &[char],
106    preferred: &[CharRange],
107    ranges: &[CharRange],
108) -> (u32, u32) {
109    fn in_range(ranges: &[CharRange], ch: char) -> Option<(u32, u32)> {
110        ranges
111            .iter()
112            .find(|r| ch >= *r.start() && ch <= *r.end())
113            .map(|r| (*r.start() as u32, ch as u32 - *r.start() as u32))
114    }
115
116    if !special.is_empty() && rnd.gen() {
117        let s = special[rnd.gen_range(0..special.len())];
118        if let Some(ret) = in_range(ranges, s) {
119            return ret;
120        }
121    }
122
123    if !preferred.is_empty() && rnd.gen() {
124        let range = preferred[rnd.gen_range(0..preferred.len())].clone();
125        if let Some(ch) = ::core::char::from_u32(
126            rnd.gen_range(*range.start() as u32..*range.end() as u32 + 1),
127        ) {
128            if let Some(ret) = in_range(ranges, ch) {
129                return ret;
130            }
131        }
132    }
133
134    for _ in 0..65_536 {
135        let range = ranges[rnd.gen_range(0..ranges.len())].clone();
136        if let Some(ch) = ::core::char::from_u32(
137            rnd.gen_range(*range.start() as u32..*range.end() as u32 + 1),
138        ) {
139            return (*range.start() as u32, ch as u32 - *range.start() as u32);
140        }
141    }
142
143    // Give up and return a character we at least know is valid.
144    (*ranges[0].start() as u32, 0)
145}
146
147/// Strategy for generating `char`s.
148///
149/// Character selection is more sophisticated than integer selection. Naïve
150/// selection (particularly in the larger context of generating strings) would
151/// result in starting inputs like `ꂡ螧轎ቶᢹ糦狥芹ᘆ㶏曊ᒀ踔虙ჲ` and "simplified"
152/// inputs consisting mostly of control characters. It also has difficulty
153/// locating edge cases, since the vast majority of code points (such as the
154/// enormous CJK regions) don't cause problems for anything with even basic
155/// Unicode support.
156///
157/// Instead, character selection is always based on explicit ranges, and is
158/// designed to bias to specifically chosen characters and character ranges to
159/// produce inputs that are both more useful and easier for humans to
160/// understand. There are also hard-wired simplification targets based on ASCII
161/// instead of simply simplifying towards NUL to avoid problematic inputs being
162/// reduced to a bunch of NUL characters.
163///
164/// Shrinking never crosses ranges. If you have a complex range like `[A-Za-z]`
165/// and the starting point `x` is chosen, it will not shrink to the first `A-Z`
166/// group, but rather simply to `a`.
167///
168/// The usual way to get instances of this class is with the module-level `ANY`
169/// constant or `range` function. Directly constructing a `CharStrategy` is
170/// only necessary for complex ranges or to override the default biases.
171#[derive(Debug, Clone)]
172#[must_use = "strategies do nothing unless used"]
173pub struct CharStrategy<'a> {
174    special: Cow<'a, [char]>,
175    preferred: Cow<'a, [CharRange]>,
176    ranges: Cow<'a, [CharRange]>,
177}
178
179impl<'a> CharStrategy<'a> {
180    /// Construct a new `CharStrategy` with the parameters it will pass to the
181    /// function underlying `select_char()`.
182    ///
183    /// All arguments as per `select_char()`.
184    pub fn new(
185        special: Cow<'a, [char]>,
186        preferred: Cow<'a, [CharRange]>,
187        ranges: Cow<'a, [CharRange]>,
188    ) -> Self {
189        CharStrategy {
190            special,
191            preferred,
192            ranges,
193        }
194    }
195
196    /// Same as `CharStrategy::new()` but using `Cow::Borrowed` for all parts.
197    pub fn new_borrowed(
198        special: &'a [char],
199        preferred: &'a [CharRange],
200        ranges: &'a [CharRange],
201    ) -> Self {
202        CharStrategy::new(
203            Cow::Borrowed(special),
204            Cow::Borrowed(preferred),
205            Cow::Borrowed(ranges),
206        )
207    }
208}
209
210const WHOLE_RANGE: &[CharRange] = &['\x00'..=::core::char::MAX];
211
212/// Creates a `CharStrategy` which picks from literally any character, with the
213/// default biases.
214pub fn any() -> CharStrategy<'static> {
215    CharStrategy {
216        special: Cow::Borrowed(DEFAULT_SPECIAL_CHARS),
217        preferred: Cow::Borrowed(DEFAULT_PREFERRED_RANGES),
218        ranges: Cow::Borrowed(WHOLE_RANGE),
219    }
220}
221
222/// Creates a `CharStrategy` which selects characters within the given
223/// endpoints, inclusive, using the default biases.
224pub fn range(start: char, end: char) -> CharStrategy<'static> {
225    CharStrategy {
226        special: Cow::Borrowed(DEFAULT_SPECIAL_CHARS),
227        preferred: Cow::Borrowed(DEFAULT_PREFERRED_RANGES),
228        ranges: Cow::Owned(vec![start..=end]),
229    }
230}
231
232/// Creates a `CharStrategy` which selects characters within the given ranges,
233/// all inclusive, using the default biases.
234pub fn ranges(ranges: Cow<[CharRange]>) -> CharStrategy {
235    CharStrategy {
236        special: Cow::Borrowed(DEFAULT_SPECIAL_CHARS),
237        preferred: Cow::Borrowed(DEFAULT_PREFERRED_RANGES),
238        ranges,
239    }
240}
241
242/// The `ValueTree` corresponding to `CharStrategy`.
243#[derive(Debug, Clone, Copy)]
244pub struct CharValueTree {
245    value: num::u32::BinarySearch,
246}
247
248impl<'a> Strategy for CharStrategy<'a> {
249    type Tree = CharValueTree;
250    type Value = char;
251
252    fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
253        let (base, offset) = select_range_index(
254            runner.rng(),
255            &self.special,
256            &self.preferred,
257            &self.ranges,
258        );
259
260        // Select a minimum point more convenient than 0
261        let start = base + offset;
262        let bottom = if start >= '¡' as u32 && base < '¡' as u32 {
263            '¡' as u32
264        } else if start >= 'a' as u32 && base < 'a' as u32 {
265            'a' as u32
266        } else if start >= 'A' as u32 && base < 'A' as u32 {
267            'A' as u32
268        } else if start >= '0' as u32 && base < '0' as u32 {
269            '0' as u32
270        } else if start >= ' ' as u32 && base < ' ' as u32 {
271            ' ' as u32
272        } else {
273            base
274        };
275
276        Ok(CharValueTree {
277            value: num::u32::BinarySearch::new_above(bottom, start),
278        })
279    }
280}
281
282impl CharValueTree {
283    fn reposition(&mut self) {
284        while ::core::char::from_u32(self.value.current()).is_none() {
285            if !self.value.complicate() {
286                panic!("Converged to non-char value");
287            }
288        }
289    }
290}
291
292impl ValueTree for CharValueTree {
293    type Value = char;
294
295    fn current(&self) -> char {
296        ::core::char::from_u32(self.value.current())
297            .expect("Generated non-char value")
298    }
299
300    fn simplify(&mut self) -> bool {
301        if self.value.simplify() {
302            self.reposition();
303            true
304        } else {
305            false
306        }
307    }
308
309    fn complicate(&mut self) -> bool {
310        if self.value.complicate() {
311            self.reposition();
312            true
313        } else {
314            false
315        }
316    }
317}
318
319#[cfg(test)]
320mod test {
321    use std::cmp::{max, min};
322    use std::vec::Vec;
323
324    use super::*;
325    use crate::collection;
326
327    proptest! {
328        #[test]
329        fn stays_in_range(input_ranges in collection::vec(
330            (0..::std::char::MAX as u32,
331             0..::std::char::MAX as u32),
332            1..5))
333        {
334            let input = ranges(Cow::Owned(input_ranges.iter().map(
335                |&(lo, hi)| ::std::char::from_u32(lo).and_then(
336                    |lo| ::std::char::from_u32(hi).map(
337                        |hi| min(lo, hi) ..= max(lo, hi)))
338                    .ok_or_else(|| TestCaseError::reject("non-char")))
339                                          .collect::<Result<Vec<CharRange>,_>>()?));
340
341            let mut runner = TestRunner::default();
342            for _ in 0..256 {
343                let mut value = input.new_tree(&mut runner).unwrap();
344                loop {
345                    let ch = value.current() as u32;
346                    assert!(input_ranges.iter().any(
347                        |&(lo, hi)| ch >= min(lo, hi) &&
348                            ch <= max(lo, hi)));
349
350                    if !value.simplify() { break; }
351                }
352            }
353        }
354    }
355
356    #[test]
357    fn applies_desired_bias() {
358        let mut men_in_business_suits_levitating = 0;
359        let mut ascii_printable = 0;
360        let mut runner = TestRunner::deterministic();
361
362        for _ in 0..1024 {
363            let ch = any().new_tree(&mut runner).unwrap().current();
364            if '🕴' == ch {
365                men_in_business_suits_levitating += 1;
366            } else if ch >= ' ' && ch <= '~' {
367                ascii_printable += 1;
368            }
369        }
370
371        assert!(ascii_printable >= 256);
372        assert!(men_in_business_suits_levitating >= 1);
373    }
374
375    #[test]
376    fn doesnt_shrink_to_ascii_control() {
377        let mut accepted = 0;
378        let mut runner = TestRunner::deterministic();
379
380        for _ in 0..256 {
381            let mut value = any().new_tree(&mut runner).unwrap();
382
383            if value.current() <= ' ' {
384                continue;
385            }
386
387            while value.simplify() {}
388
389            assert!(value.current() >= ' ');
390            accepted += 1;
391        }
392
393        assert!(accepted >= 200);
394    }
395
396    #[test]
397    fn test_sanity() {
398        check_strategy_sanity(
399            any(),
400            Some(CheckStrategySanityOptions {
401                // `simplify()` can itself `complicate()` back to the starting
402                // position, so the overly strict complicate-after-simplify check
403                // must be disabled.
404                strict_complicate_after_simplify: false,
405                ..CheckStrategySanityOptions::default()
406            }),
407        );
408    }
409}