determinator/
paths0.rs

1// Copyright (c) The cargo-guppy Contributors
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use camino::Utf8Path;
5use std::str::Utf8Error;
6
/// An owned buffer holding a list of paths separated by null characters.
///
/// Both Unix and Windows paths are supported, including the conversion of `/` to `\` on Windows.
///
/// # Null-separated paths
///
/// Tools such as `git diff --name-only` normally emit one path per line, using newline characters
/// (`\n`) as separators. Unix filenames may themselves contain newlines, though, which is why
/// source control systems often end up [quoting newlines and other "unusual"
/// characters](https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath).
///
/// Separating paths with null characters instead is robust and lossless: both Unix and Windows
/// platforms guarantee that a path can never have embedded null characters.
///
/// # Examples
///
/// Most source control systems can provide null-separated paths. The commands below are meant to
/// be run from the Cargo workspace root (which is assumed to be the same as the repository root).
///
/// The most common use case is comparing the current working directory against the [*merge
/// base*][mb], or [*nearest/greatest/lowest common
/// ancestor*](https://en.wikipedia.org/wiki/Lowest_common_ancestor), of the current commit with a
/// specified upstream revision, such as `origin/main`. To do so, run:
///
/// * Git: `git diff -z --name-only $(git merge-base <upstream rev> HEAD)`
/// * Mercurial: `hg status --print0 -mard --no-status --rev 'ancestor(<upstream rev>,.)'`
///
/// [mb]:
///     https://stackoverflow.com/questions/1549146/git-find-the-most-recent-common-ancestor-of-two-branches
///
/// ---
///
/// **NOTE:**
/// * The `$()` syntax in Bash and other shells means "run the command and insert its output
///   here".
/// * Git provides a syntax `<upstream rev>...` which purports to use the merge base, but it ignores
///   uncommitted changes. Executing `git merge-base` as a separate command is the only way to
///   include uncommitted changes.
/// * The `-mard` flag to `hg status` means that untracked files are not included. `git diff` does
///   not have an option to display untracked files. For more discussion, see the documentation for
///   [`add_changed_paths`](crate::Determinator::add_changed_paths).
///
/// ---
///
/// More generally, the list of paths changed between two revisions can be obtained with the
/// following commands (omit `<new rev>` to compare against the working directory):
///
/// * Git: `git diff -z --name-only <old rev> <new rev>`
/// * Mercurial: `hg status --print0 -mard --no-status <old rev> <new rev>`
///
/// To list every file in the working directory that is tracked by the source control system:
///
/// * Git: `git ls-files -z`
/// * Mercurial: `hg files --print0`
///
/// Git commands produce null-separated paths through the `-z` option, and Mercurial through the
/// `--print0` option. If you're using a different system, check its help for instructions.
///
/// # Implementations
///
/// `&'a Utf8Paths0` implements `IntoIterator<Item = &'a Utf8Path>`.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Utf8Paths0 {
    // The null-separated paths, stored without a trailing null byte.
    buf: Box<str>,
}
73
74impl Utf8Paths0 {
75    /// Creates a new instance of `Utf8Paths0` from a string with embedded nulls.
76    ///
77    /// The string may, but does not need to, have a trailing null byte.
78    pub fn new(buf: impl Into<String>) -> Self {
79        Self::strip_trailing_null_byte(buf.into())
80    }
81
82    /// Creates a new instance of `Utf8Paths0` from a `Vec<u8>`, performing a UTF-8 validation
83    /// check on the buffer.
84    ///
85    /// The buffer may, but does not need to, have a trailing null byte.
86    ///
87    /// ## Errors
88    ///
89    /// If any paths inside the string weren't valid UTF-8, this returns the first path that failed
90    /// to parse and the error returned.
91    pub fn from_bytes(buf: impl Into<Vec<u8>>) -> Result<Self, (Vec<u8>, Utf8Error)> {
92        let buf = buf.into();
93        let buf = Self::validate_utf8(buf)?;
94        Ok(Self::strip_trailing_null_byte(buf))
95    }
96
97    /// Creates a new instance of `Utf8Paths0`, converting `/` to `\` on platforms like Windows.
98    ///
99    /// Some tools like Git (but not Mercurial) return paths with `/` on Windows, even though the
100    /// canonical separator on the platform is `\`. This constructor changes all instances of `/`
101    /// to `\`.
102    pub fn new_forward_slashes(buf: impl Into<String>) -> Self {
103        let mut buf = buf.into();
104        // Change all `/` to `\` on Windows.
105        if std::path::MAIN_SEPARATOR == '\\' {
106            buf = buf.replace('/', "\\");
107        }
108        Self::strip_trailing_null_byte(buf)
109    }
110
111    /// Iterates over the paths in this buffer.
112    pub fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a Utf8Path> + 'a> {
113        self.into_iter()
114    }
115
116    // ---
117    // Helper methods
118    // ---
119
120    fn validate_utf8(buf: Vec<u8>) -> Result<String, (Vec<u8>, Utf8Error)> {
121        match String::from_utf8(buf) {
122            Ok(s) => Ok(s),
123            Err(err) => {
124                let buf = err.into_bytes();
125                // Look for the path that failed validation.
126                buf.split(|b| *b == 0)
127                    .try_for_each(|path| match std::str::from_utf8(path) {
128                        Ok(_) => Ok(()),
129                        Err(utf8_error) => Err((path.to_vec(), utf8_error)),
130                    })?;
131                unreachable!("full buffer failed utf-8 validation => at least one path failed");
132            }
133        }
134    }
135
136    fn strip_trailing_null_byte(mut buf: String) -> Self {
137        if buf.as_bytes().last() == Some(&0) {
138            buf.pop();
139        }
140
141        Self { buf: buf.into() }
142    }
143}
144
145impl<'a> IntoIterator for &'a Utf8Paths0 {
146    type Item = &'a Utf8Path;
147    type IntoIter = Box<dyn Iterator<Item = &'a Utf8Path> + 'a>;
148
149    fn into_iter(self) -> Self::IntoIter {
150        // An empty string means there are no paths -- this has to be handled as a special case.
151        if self.buf.is_empty() {
152            return Box::new(std::iter::empty());
153        }
154
155        Box::new(self.buf.split('\0').map(Utf8Path::new))
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn basic() {
        // No input at all means no paths.
        paths_eq(*b"", &[]);

        // A single path, then two paths without and with a trailing null byte.
        paths_eq(*b"a/b/c", &["a/b/c"]);
        paths_eq(*b"a/b\0a/c", &["a/b", "a/c"]);
        paths_eq(*b"a/b\0a/c\0", &["a/b", "a/c"]);

        // Multi-byte UTF-8 sequences inside a path.
        paths_eq(*b"a/b\xF0\x9F\x98\x81\0c/d", &["a/b😁", "c/d"]);
    }

    // Backslashes matter mostly on Windows, but this test should pass on every platform.
    #[test]
    fn backslashes() {
        paths_eq(*b"a\\b\\c", &["a\\b\\c"]);
        paths_eq(*b"a\\b\0a\\c", &["a\\b", "a\\c"]);
        paths_eq(*b"a\\b\0a\\c\0", &["a\\b", "a\\c"]);
    }

    #[cfg(windows)]
    #[test]
    fn forward_slashes() {
        paths_eq_fwd(*b"a/b/c", &["a\\b\\c"]);
        paths_eq_fwd(*b"a/b\0a/c", &["a\\b", "a\\c"]);
        paths_eq_fwd(*b"a/b\0a/c\0", &["a\\b", "a\\c"]);

        // Inputs mixing forward slashes and backslashes are normalized as well.
        paths_eq_fwd(*b"a/b\0a\\c", &["a\\b", "a\\c"]);
    }

    /// Parses `bytes` with `Utf8Paths0::from_bytes` and checks the resulting paths.
    fn paths_eq(bytes: impl Into<Vec<u8>>, expected: &[&str]) {
        let parsed = Utf8Paths0::from_bytes(bytes.into()).expect("null-separated paths are valid");
        assert_paths(&parsed, expected);
    }

    /// Parses `bytes` with `Utf8Paths0::new_forward_slashes` and checks the resulting paths.
    #[cfg(windows)]
    fn paths_eq_fwd(bytes: impl Into<Vec<u8>>, expected: &[&str]) {
        let text = String::from_utf8(bytes.into()).expect("valid UTF-8");
        let parsed = Utf8Paths0::new_forward_slashes(text);
        assert_paths(&parsed, expected);
    }

    /// Asserts that iterating over `paths` yields exactly `expected`, in order.
    fn assert_paths(paths: &Utf8Paths0, expected: &[&str]) {
        let actual: Vec<&Utf8Path> = paths.iter().collect();
        let expected: Vec<&Utf8Path> = expected.iter().map(Utf8Path::new).collect();

        assert_eq!(actual, expected, "paths match");
    }
}