determinator/paths0.rs
1// Copyright (c) The cargo-guppy Contributors
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use camino::Utf8Path;
5use std::str::Utf8Error;
6
/// An owned store of null-separated UTF-8 paths.
///
/// This manages paths on Unix and Windows platforms, including converting `/` on Windows to `\`
/// (see [`new_forward_slashes`](Self::new_forward_slashes)).
///
/// # Null-separated paths
///
/// Paths as produced by tools like `git diff --name-only` are typically separated by newline
/// characters (`\n`). However, on Unix platforms filenames can themselves have newlines embedded
/// in them, so source control systems often end up [quoting newlines and other "unusual"
/// characters](https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath).
///
/// A robust, lossless way to retrieve a list of paths is to separate them with null characters
/// instead. Both Unix and Windows platforms guarantee that a path can never have embedded null
/// characters.
///
/// # Examples
///
/// Most source control systems can provide null-separated paths. These examples are expected to
/// be run from the Cargo workspace root (which is assumed to be the same as the repository root).
///
/// In most cases, you'll want to compare the current working directory against the [*merge
/// base*][mb], or [*nearest/greatest/lowest common
/// ancestor*](https://en.wikipedia.org/wiki/Lowest_common_ancestor), of the current commit with a
/// specified upstream revision, such as `origin/main`. To do so, run:
///
/// * Git: `git diff -z --name-only $(git merge-base <upstream rev> HEAD)`
/// * Mercurial: `hg status --print0 -mard --no-status --rev 'ancestor(<upstream rev>,.)'`
///
/// [mb]:
///     https://stackoverflow.com/questions/1549146/git-find-the-most-recent-common-ancestor-of-two-branches
///
/// ---
///
/// **NOTE:**
/// * The `$()` syntax in Bash and other shells means "run the command and insert its output
///   here".
/// * Git provides a syntax `<upstream rev>...` which purports to use the merge base, but it
///   ignores uncommitted changes. Executing `git merge-base` as a separate command is the only
///   way to include uncommitted changes.
/// * The `-mard` flag to `hg status` means that untracked files are not included. `git diff` does
///   not have an option to display untracked files. For more discussion, see the documentation
///   for [`add_changed_paths`](crate::Determinator::add_changed_paths).
///
/// ---
///
/// In general, to obtain a list of changed paths between two revisions (omit `<new rev>` if
/// comparing against the working directory):
///
/// * Git: `git diff -z --name-only <old rev> <new rev>`
/// * Mercurial: `hg status --print0 -mard --no-status <old rev> <new rev>`
///
/// To obtain a list of all files in the working directory that are tracked by the source control
/// system:
///
/// * Git: `git ls-files -z`
/// * Mercurial: `hg files --print0`
///
/// Null-separated paths are produced through the `-z` option to Git commands, or the `--print0`
/// option to Mercurial. If you're using a different system, check its help for instructions.
///
/// # Implementations
///
/// `&'a Utf8Paths0` implements `IntoIterator<Item = &'a Utf8Path>`.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Utf8Paths0 {
    // The null-separated paths; the constructors in this file strip one trailing null byte
    // before storing the buffer here.
    buf: Box<str>,
}
73
74impl Utf8Paths0 {
75 /// Creates a new instance of `Utf8Paths0` from a string with embedded nulls.
76 ///
77 /// The string may, but does not need to, have a trailing null byte.
78 pub fn new(buf: impl Into<String>) -> Self {
79 Self::strip_trailing_null_byte(buf.into())
80 }
81
82 /// Creates a new instance of `Utf8Paths0` from a `Vec<u8>`, performing a UTF-8 validation
83 /// check on the buffer.
84 ///
85 /// The buffer may, but does not need to, have a trailing null byte.
86 ///
87 /// ## Errors
88 ///
89 /// If any paths inside the string weren't valid UTF-8, this returns the first path that failed
90 /// to parse and the error returned.
91 pub fn from_bytes(buf: impl Into<Vec<u8>>) -> Result<Self, (Vec<u8>, Utf8Error)> {
92 let buf = buf.into();
93 let buf = Self::validate_utf8(buf)?;
94 Ok(Self::strip_trailing_null_byte(buf))
95 }
96
97 /// Creates a new instance of `Utf8Paths0`, converting `/` to `\` on platforms like Windows.
98 ///
99 /// Some tools like Git (but not Mercurial) return paths with `/` on Windows, even though the
100 /// canonical separator on the platform is `\`. This constructor changes all instances of `/`
101 /// to `\`.
102 pub fn new_forward_slashes(buf: impl Into<String>) -> Self {
103 let mut buf = buf.into();
104 // Change all `/` to `\` on Windows.
105 if std::path::MAIN_SEPARATOR == '\\' {
106 buf = buf.replace('/', "\\");
107 }
108 Self::strip_trailing_null_byte(buf)
109 }
110
111 /// Iterates over the paths in this buffer.
112 pub fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a Utf8Path> + 'a> {
113 self.into_iter()
114 }
115
116 // ---
117 // Helper methods
118 // ---
119
120 fn validate_utf8(buf: Vec<u8>) -> Result<String, (Vec<u8>, Utf8Error)> {
121 match String::from_utf8(buf) {
122 Ok(s) => Ok(s),
123 Err(err) => {
124 let buf = err.into_bytes();
125 // Look for the path that failed validation.
126 buf.split(|b| *b == 0)
127 .try_for_each(|path| match std::str::from_utf8(path) {
128 Ok(_) => Ok(()),
129 Err(utf8_error) => Err((path.to_vec(), utf8_error)),
130 })?;
131 unreachable!("full buffer failed utf-8 validation => at least one path failed");
132 }
133 }
134 }
135
136 fn strip_trailing_null_byte(mut buf: String) -> Self {
137 if buf.as_bytes().last() == Some(&0) {
138 buf.pop();
139 }
140
141 Self { buf: buf.into() }
142 }
143}
144
145impl<'a> IntoIterator for &'a Utf8Paths0 {
146 type Item = &'a Utf8Path;
147 type IntoIter = Box<dyn Iterator<Item = &'a Utf8Path> + 'a>;
148
149 fn into_iter(self) -> Self::IntoIter {
150 // An empty string means there are no paths -- this has to be handled as a special case.
151 if self.buf.is_empty() {
152 return Box::new(std::iter::empty());
153 }
154
155 Box::new(self.buf.split('\0').map(Utf8Path::new))
156 }
157}
158
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn basic() {
        // Empty string should return no paths.
        paths_eq(*b"", &[]);
        // A buffer holding only the trailing null byte also contains no paths.
        paths_eq(*b"\0", &[]);

        paths_eq(*b"a/b/c", &["a/b/c"]);
        paths_eq(*b"a/b\0a/c", &["a/b", "a/c"]);
        paths_eq(*b"a/b\0a/c\0", &["a/b", "a/c"]);

        // UTF-8
        paths_eq(*b"a/b\xF0\x9F\x98\x81\0c/d", &["a/b😁", "c/d"]);
    }

    // This is really a Windows test but it should work on all platforms.
    #[test]
    fn backslashes() {
        paths_eq(*b"a\\b\\c", &["a\\b\\c"]);
        paths_eq(*b"a\\b\0a\\c", &["a\\b", "a\\c"]);
        paths_eq(*b"a\\b\0a\\c\0", &["a\\b", "a\\c"]);
    }

    #[cfg(windows)]
    #[test]
    fn forward_slashes() {
        paths_eq_fwd(*b"a/b/c", &["a\\b\\c"]);
        paths_eq_fwd(*b"a/b\0a/c", &["a\\b", "a\\c"]);
        paths_eq_fwd(*b"a/b\0a/c\0", &["a\\b", "a\\c"]);

        // Also test mixed forward/backslashes.
        paths_eq_fwd(*b"a/b\0a\\c", &["a\\b", "a\\c"]);
    }

    // Exercises the error path of `from_bytes`: the first path that fails UTF-8 validation must be
    // reported, with the error offset relative to that path rather than to the whole buffer.
    #[test]
    fn invalid_utf8() {
        // 0xFF can never appear in UTF-8, so only the middle path is invalid.
        let (path, err) = Utf8Paths0::from_bytes(*b"a/b\0c/\xFFd\0e/f")
            .expect_err("buffer contains an invalid path");

        assert_eq!(path, b"c/\xFFd".to_vec(), "first invalid path is returned");
        assert_eq!(
            err.valid_up_to(),
            2,
            "error offset is relative to the failing path"
        );
    }

    /// Parses `bytes` with `from_bytes` and asserts that iteration yields `expected`.
    fn paths_eq(bytes: impl Into<Vec<u8>>, expected: &[&str]) {
        let paths = Utf8Paths0::from_bytes(bytes.into()).expect("null-separated paths are valid");
        assert_paths(&paths, expected);
    }

    /// Parses `bytes` with `new_forward_slashes` and asserts that iteration yields `expected`.
    #[cfg(windows)]
    fn paths_eq_fwd(bytes: impl Into<Vec<u8>>, expected: &[&str]) {
        let s = String::from_utf8(bytes.into()).expect("valid UTF-8");
        let paths = Utf8Paths0::new_forward_slashes(s);
        assert_paths(&paths, expected);
    }

    /// Shared comparison used by the helpers above.
    fn assert_paths(paths: &Utf8Paths0, expected: &[&str]) {
        let actual: Vec<_> = paths.iter().collect();
        let expected: Vec<_> = expected.iter().map(Utf8Path::new).collect();

        assert_eq!(actual, expected, "paths match");
    }
}