determinator/paths0.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
// Copyright (c) The cargo-guppy Contributors
// SPDX-License-Identifier: MIT OR Apache-2.0
use camino::Utf8Path;
use std::str::Utf8Error;
/// A store for null-separated paths.
///
/// This manages paths on Unix and Windows platforms, including converting `/` on Windows to `\`.
///
/// # Null-separated paths
///
/// Paths as produced by tools like `git diff --name-only` are typically separated by newline
/// characters (`\n`). However, on Unix platforms filenames can themselves have newlines embedded in
/// them, so source control systems often end up [quoting newlines and other "unusual"
/// characters](https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath).
///
/// A robust, lossless way to retrieve a list of paths is by separating them with null characters.
/// Both Unix and Windows platforms guarantee that a path can never have embedded null characters.
///
/// # Examples
///
/// Most source control systems can provide null-separated paths. These examples are expected to be
/// run from the Cargo workspace root (which is assumed to be the same as the repository root).
///
/// In most cases, you'll want to compare the current working directory against the [*merge
/// base*][mb], or [*nearest/greatest/lowest common
/// ancestor*](https://en.wikipedia.org/wiki/Lowest_common_ancestor), of the current commit with a
/// specified upstream revision, such as `origin/main`. To do so, run:
///
/// * Git: `git diff -z --name-only $(git merge-base <upstream rev> HEAD)`
/// * Mercurial: `hg status --print0 -mard --no-status --rev 'ancestor(<upstream rev>,.)'`
///
/// [mb]:
/// https://stackoverflow.com/questions/1549146/git-find-the-most-recent-common-ancestor-of-two-branches
///
/// ---
///
/// **NOTE:**
/// * The `$()` syntax in Bash and other shells means "run the command and insert its output
/// here".
/// * Git provides a syntax `<upstream rev>...` which purports to use the merge base, but it ignores
/// uncommitted changes. Executing `git merge-base` as a separate command is the only way to
/// include uncommitted changes.
/// * The `-mard` flag to `hg status` means that untracked files are not included. `git diff` does
/// not have an option to display untracked files. For more discussion, see the documentation for
/// [`add_changed_paths`](crate::Determinator::add_changed_paths).
///
/// ---
///
/// In general, to obtain a list of changed paths between two revisions (omit `<new rev>` if
/// comparing against the working directory):
///
/// * Git: `git diff -z --name-only <old rev> <new rev>`
/// * Mercurial: `hg status --print0 -mard --no-status <old rev> <new rev>`
///
/// To obtain a list of all files in the working directory that are tracked by the source control
/// system:
///
/// * Git: `git ls-files -z`
/// * Mercurial: `hg files --print0`
///
/// Null-separated paths are produced through the `-z` option to Git commands, or the `--print0`
/// option to Mercurial. If you're using a different system, check its help for instructions.
///
/// # Implementations
///
/// `&'a Utf8Paths0` implements `IntoIterator<Item = &'a Utf8Path>`.
#[derive(Clone, Debug, Eq, Ord, PartialOrd, PartialEq)]
pub struct Utf8Paths0 {
/// The raw path data: UTF-8 paths separated by `\0`. Every constructor removes one trailing
/// null byte (if present) before storing the buffer here.
buf: Box<str>,
}
impl Utf8Paths0 {
/// Builds a `Utf8Paths0` out of a string whose paths are separated by null characters.
///
/// A single trailing null byte is optional: if one is present, it is removed.
pub fn new(buf: impl Into<String>) -> Self {
    let owned: String = buf.into();
    Self::strip_trailing_null_byte(owned)
}
/// Builds a `Utf8Paths0` out of raw bytes, first checking that they are valid UTF-8.
///
/// A single trailing null byte is optional: if one is present, it is removed.
///
/// ## Errors
///
/// If the buffer is not valid UTF-8, the error is a pair made of the first null-separated path
/// that failed validation (as raw bytes) and the [`Utf8Error`] describing that failure.
pub fn from_bytes(buf: impl Into<Vec<u8>>) -> Result<Self, (Vec<u8>, Utf8Error)> {
    // Validation happens on the full buffer; the trailing null is only stripped on success.
    Self::validate_utf8(buf.into()).map(Self::strip_trailing_null_byte)
}
/// Builds a `Utf8Paths0`, rewriting `/` as `\` when `\` is the platform's main separator.
///
/// Some tools like Git (but not Mercurial) print paths with `/` on Windows, even though the
/// canonical separator there is `\`. When [`std::path::MAIN_SEPARATOR`] is `\`, every `/` in
/// the input is replaced by `\`; on other platforms the input is kept as is.
///
/// A single trailing null byte is optional: if one is present, it is removed.
pub fn new_forward_slashes(buf: impl Into<String>) -> Self {
    let original: String = buf.into();
    // Only platforms whose main separator is `\` (i.e. Windows) need the rewrite.
    let normalized = if std::path::MAIN_SEPARATOR == '\\' {
        original.replace('/', "\\")
    } else {
        original
    };
    Self::strip_trailing_null_byte(normalized)
}
/// Returns an iterator over the paths stored in this buffer.
///
/// This is the same iterator produced by the `IntoIterator` implementation for `&Utf8Paths0`.
pub fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a Utf8Path> + 'a> {
    IntoIterator::into_iter(self)
}
// ---
// Helper methods
// ---
/// Converts `buf` into a `String` if the whole buffer is valid UTF-8.
///
/// On failure, the buffer is split on null bytes and the first segment that is not valid UTF-8
/// is returned (as owned bytes) together with the `Utf8Error` for that segment alone, so the
/// error offsets are relative to the start of the failing path rather than the full buffer.
fn validate_utf8(buf: Vec<u8>) -> Result<String, (Vec<u8>, Utf8Error)> {
    let raw = match String::from_utf8(buf) {
        Ok(valid) => return Ok(valid),
        // Recover the original bytes so the offending path can be located.
        Err(whole_buffer_error) => whole_buffer_error.into_bytes(),
    };

    // Look for the path that failed validation.
    let first_failure = raw.split(|&byte| byte == 0).find_map(|path| {
        std::str::from_utf8(path)
            .err()
            .map(|utf8_error| (path.to_vec(), utf8_error))
    });

    match first_failure {
        Some(failure) => Err(failure),
        None => {
            unreachable!("full buffer failed utf-8 validation => at least one path failed")
        }
    }
}
/// Wraps `buf` in a `Utf8Paths0`, dropping at most one trailing null byte.
///
/// Only the final byte is examined, so a buffer ending in several null bytes keeps all but the
/// last of them.
fn strip_trailing_null_byte(mut buf: String) -> Self {
    if buf.ends_with('\0') {
        // The null character is a single byte in UTF-8, so this lands on a char boundary.
        buf.truncate(buf.len() - 1);
    }
    Self {
        buf: buf.into_boxed_str(),
    }
}
}
/// Borrowing iteration over the paths held by a `Utf8Paths0`.
///
/// The stored buffer is split on null characters and each piece is viewed as a `Utf8Path`.
impl<'a> IntoIterator for &'a Utf8Paths0 {
    type Item = &'a Utf8Path;
    type IntoIter = Box<dyn Iterator<Item = &'a Utf8Path> + 'a>;

    fn into_iter(self) -> Self::IntoIter {
        // Splitting an empty string yields a single empty piece, which would show up as one
        // empty path. An empty buffer means "no paths", so it gets its own branch.
        if self.buf.is_empty() {
            Box::new(std::iter::empty())
        } else {
            Box::new(self.buf.split('\0').map(Utf8Path::new))
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain and UTF-8 inputs parsed through `from_bytes`.
    #[test]
    fn basic() {
        // Empty string should return no paths.
        paths_eq(b"", &[]);
        paths_eq(b"a/b/c", &["a/b/c"]);
        paths_eq(b"a/b\0a/c", &["a/b", "a/c"]);
        paths_eq(b"a/b\0a/c\0", &["a/b", "a/c"]);
        // UTF-8
        paths_eq(b"a/b\xF0\x9F\x98\x81\0c/d", &["a/b😁", "c/d"]);
    }

    /// Backslash-separated input parsed through `from_bytes`.
    ///
    /// This is really a Windows test but it should work on all platforms.
    #[test]
    fn backslashes() {
        paths_eq(b"a\\b\\c", &["a\\b\\c"]);
        paths_eq(b"a\\b\0a\\c", &["a\\b", "a\\c"]);
        paths_eq(b"a\\b\0a\\c\0", &["a\\b", "a\\c"]);
    }

    /// Forward slashes are rewritten to backslashes by `new_forward_slashes` on Windows.
    #[cfg(windows)]
    #[test]
    fn forward_slashes() {
        paths_eq_fwd(b"a/b/c", &["a\\b\\c"]);
        paths_eq_fwd(b"a/b\0a/c", &["a\\b", "a\\c"]);
        paths_eq_fwd(b"a/b\0a/c\0", &["a\\b", "a\\c"]);
        // Also test mixed forward/backslashes.
        paths_eq_fwd(b"a/b\0a\\c", &["a\\b", "a\\c"]);
    }

    /// Parses `bytes` with `from_bytes` and checks the resulting paths against `expected`.
    fn paths_eq(bytes: &[u8], expected: &[&str]) {
        let parsed = Utf8Paths0::from_bytes(bytes).expect("null-separated paths are valid");
        assert_paths(&parsed, expected);
    }

    /// Parses `bytes` with `new_forward_slashes` and checks the paths against `expected`.
    #[cfg(windows)]
    fn paths_eq_fwd(bytes: &[u8], expected: &[&str]) {
        let text = std::str::from_utf8(bytes).expect("valid UTF-8");
        let parsed = Utf8Paths0::new_forward_slashes(text);
        assert_paths(&parsed, expected);
    }

    /// Asserts that iterating `paths` yields exactly `expected`, compared as `Utf8Path`s.
    fn assert_paths(paths: &Utf8Paths0, expected: &[&str]) {
        let got: Vec<&Utf8Path> = paths.iter().collect();
        let want: Vec<&Utf8Path> = expected.iter().map(Utf8Path::new).collect();
        assert_eq!(got, want, "paths match");
    }
}