Skip to content

feat(ops): create get_changed_paths_between_trees_fast #14

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 214 additions & 0 deletions src/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
//! They serve as examples of how to use `git2`, but should also be usable in some limited
//! subset of cases.

use std::collections::{HashMap, HashSet};
use std::path::PathBuf;

use itertools::Itertools;

/// Lookup the commit ID for `HEAD`
Expand Down Expand Up @@ -45,6 +48,217 @@ pub fn is_dirty(repo: &git2::Repository) -> bool {
}
}

/// Recursively collect the paths that differ between two optional trees.
///
/// Every changed entry is appended to `acc` as a list of path components
/// (`current_path` followed by the entry name). `current_path` holds the
/// components of the directory that `lhs`/`rhs` represent. A missing tree
/// (`None`) is treated as an empty directory.
///
/// Subtrees whose OID and file mode are identical on both sides are never
/// opened. Entries whose name is not valid UTF-8 are skipped and therefore
/// never reported, even if they changed.
///
/// # Errors
///
/// Returns [`ChangedPathsError::TreeLookupFailure`] if a subtree that needs to
/// be descended into cannot be found in `repo`.
///
/// This function is a hot code path. Do not annotate with `#[instrument]`, and
/// be mindful of performance/memory allocations.
fn get_changed_paths_between_trees_internal(
    repo: &git2::Repository,
    acc: &mut Vec<Vec<PathBuf>>,
    current_path: &[PathBuf],
    lhs: Option<&git2::Tree>,
    rhs: Option<&git2::Tree>,
) -> ChangedPathsResult<()> {
    /// What one side of the comparison holds under a given entry name: the
    /// object ID and the raw file mode, split by whether it is a tree.
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum ClassifiedEntry {
        Absent,
        NotATree(git2::Oid, i32),
        Tree(git2::Oid, i32),
    }

    fn classify_entry(entry: Option<&git2::TreeEntry>) -> ClassifiedEntry {
        let entry = match entry {
            Some(entry) => entry,
            None => return ClassifiedEntry::Absent,
        };

        let file_mode = entry.filemode_raw();
        match entry.kind() {
            Some(git2::ObjectType::Tree) => ClassifiedEntry::Tree(entry.id(), file_mode),
            _ => ClassifiedEntry::NotATree(entry.id(), file_mode),
        }
    }

    let get_tree = |oid: git2::Oid| {
        repo.find_tree(oid)
            .map_err(|source| ChangedPathsError::TreeLookupFailure { source, oid })
    };

    let lhs_entries: Vec<_> = lhs.map(|tree| tree.iter().collect()).unwrap_or_default();
    let lhs_entries: HashMap<&[u8], &git2::TreeEntry> = lhs_entries
        .iter()
        .map(|entry| (entry.name_bytes(), entry))
        .collect();

    let rhs_entries: Vec<_> = rhs.map(|tree| tree.iter().collect()).unwrap_or_default();
    let rhs_entries: HashMap<&[u8], &git2::TreeEntry> = rhs_entries
        .iter()
        .map(|entry| (entry.name_bytes(), entry))
        .collect();

    // Visit the union of entry names exactly once without materializing it:
    // first every name on the left (paired with its right-hand counterpart, if
    // any), then the names that exist only on the right.
    let lhs_and_shared = lhs_entries
        .iter()
        .map(|(&name, &entry)| (name, Some(entry), rhs_entries.get(name).copied()));
    let rhs_only = rhs_entries
        .iter()
        .filter(|(name, _)| !lhs_entries.contains_key(**name))
        .map(|(&name, &entry)| (name, None, Some(entry)));

    for (entry_name, lhs_entry, rhs_entry) in lhs_and_shared.chain(rhs_only) {
        let lhs_class = classify_entry(lhs_entry);
        let rhs_class = classify_entry(rhs_entry);

        // Same kind, same OID and same mode: nothing changed here. Bail out
        // before validating the name or allocating the path, since this is by
        // far the most common case.
        if lhs_class == rhs_class {
            continue;
        }

        let full_entry_path = {
            let entry_name = match std::str::from_utf8(entry_name) {
                Ok(entry_name) => entry_name,
                // Names that are not valid UTF-8 are deliberately ignored.
                Err(_) => continue,
            };
            let mut full_entry_path = Vec::with_capacity(current_path.len() + 1);
            full_entry_path.extend_from_slice(current_path);
            full_entry_path.push(PathBuf::from(entry_name));
            full_entry_path
        };

        match (lhs_class, rhs_class) {
            (ClassifiedEntry::Absent, ClassifiedEntry::Absent) => {
                // Cannot happen: every visited name exists on at least one
                // side, and equal classifications were skipped above.
            }

            (ClassifiedEntry::NotATree(_, _), ClassifiedEntry::NotATree(_, _))
            | (ClassifiedEntry::Absent, ClassifiedEntry::NotATree(_, _))
            | (ClassifiedEntry::NotATree(_, _), ClassifiedEntry::Absent) => {
                // Added, removed, or changed file.
                acc.push(full_entry_path);
            }

            (ClassifiedEntry::Absent, ClassifiedEntry::Tree(tree_oid, _))
            | (ClassifiedEntry::Tree(tree_oid, _), ClassifiedEntry::Absent) => {
                // A directory was added or removed. Add all entries from that
                // directory.
                let tree = get_tree(tree_oid)?;
                get_changed_paths_between_trees_internal(
                    repo,
                    acc,
                    &full_entry_path,
                    Some(&tree),
                    None,
                )?;
            }

            (ClassifiedEntry::NotATree(_, _), ClassifiedEntry::Tree(tree_oid, _))
            | (ClassifiedEntry::Tree(tree_oid, _), ClassifiedEntry::NotATree(_, _)) => {
                // A file was changed into a directory (or vice versa). Add
                // both the file and all subdirectory entries as changed
                // entries.
                let tree = get_tree(tree_oid)?;
                get_changed_paths_between_trees_internal(
                    repo,
                    acc,
                    &full_entry_path,
                    Some(&tree),
                    None,
                )?;
                acc.push(full_entry_path);
            }

            (
                ClassifiedEntry::Tree(lhs_tree_oid, lhs_file_mode),
                ClassifiedEntry::Tree(rhs_tree_oid, rhs_file_mode),
            ) => {
                if lhs_tree_oid != rhs_tree_oid {
                    // The contents differ: report the changed files inside the
                    // subtrees.
                    let lhs_tree = get_tree(lhs_tree_oid)?;
                    let rhs_tree = get_tree(rhs_tree_oid)?;
                    get_changed_paths_between_trees_internal(
                        repo,
                        acc,
                        &full_entry_path,
                        Some(&lhs_tree),
                        Some(&rhs_tree),
                    )?;
                }

                // Note that there should only be one possible file mode for an
                // entry which points to a tree, but it's possible that some
                // extra non-meaningful bits are set. This code takes the
                // conservative approach and reports the directory itself as
                // changed in that case.
                if lhs_file_mode != rhs_file_mode {
                    acc.push(full_entry_path);
                }
            }
        }
    }

    Ok(())
}

/// Errors that can occur while calculating the changed paths between two
/// trees.
#[derive(Debug, PartialEq)]
pub enum ChangedPathsError {
    /// An error occurred when trying to look up a tree by OID.
    TreeLookupFailure { source: git2::Error, oid: git2::Oid },
}

impl std::fmt::Display for ChangedPathsError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // The underlying `git2` error is exposed through `source()` rather
            // than repeated here, so error-chain printers do not show it twice.
            Self::TreeLookupFailure { oid, .. } => write!(f, "could not look up tree {}", oid),
        }
    }
}

impl std::error::Error for ChangedPathsError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::TreeLookupFailure { source, .. } => Some(source),
        }
    }
}

/// Result type whose error variant is a [`ChangedPathsError`].
pub type ChangedPathsResult<T> = Result<T, ChangedPathsError>;

/// Compute the set of paths that differ between two trees, faster than
/// libgit2's own tree diff. See https://github.com/libgit2/libgit2/issues/6036
/// for background.
///
/// libgit2 walks both trees recursively and compares every entry, so its cost
/// is O(n) in the size of the trees. This implementation walks both trees
/// together and does not descend into subtrees that are equal on both sides,
/// so its cost is O(n) in the number of *changes* instead.
///
/// Passing `None` for either side treats that side as an empty tree. Entries
/// whose names are not valid UTF-8 are not reported.
///
/// # Errors
///
/// Returns [`ChangedPathsError::TreeLookupFailure`] if a subtree cannot be
/// looked up in `repo`.
pub fn get_changed_paths_between_trees_fast(
    repo: &git2::Repository,
    lhs: Option<&git2::Tree>,
    rhs: Option<&git2::Tree>,
) -> ChangedPathsResult<HashSet<PathBuf>> {
    // Each changed path is collected as a list of components and only joined
    // into a single `PathBuf` at the very end.
    let mut changed_components: Vec<Vec<PathBuf>> = Vec::new();
    get_changed_paths_between_trees_internal(repo, &mut changed_components, &[], lhs, rhs)?;

    Ok(changed_components
        .into_iter()
        .map(|components| components.into_iter().collect::<PathBuf>())
        .collect())
}

/// Cherry pick a commit onto another without touching the working directory
pub fn cherry_pick(
repo: &git2::Repository,
Expand Down
34 changes: 34 additions & 0 deletions tests/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,37 @@ fn reword() {

temp.close().unwrap();
}

/// Diffing `HEAD` against its first parent in the `branches.yml` fixture must
/// report exactly `file_c.txt` as changed.
#[test]
fn test_get_changed_paths_between_trees() {
    let temp = assert_fs::TempDir::new().unwrap();
    let fixture_path = std::path::Path::new("tests/fixtures/branches.yml");
    let plan = git_fixture::TodoList::load(fixture_path).unwrap();
    plan.run(temp.path()).unwrap();

    let repo = git2::Repository::discover(temp.path()).unwrap();

    {
        let head_tree = repo.head().unwrap().peel_to_tree().unwrap();

        let head_commit = repo.head().unwrap().peel_to_commit().unwrap();
        let parent_commit = head_commit.parent(0).unwrap();
        let parent_tree = parent_commit.tree().unwrap();

        let changed_paths = git2_ext::ops::get_changed_paths_between_trees_fast(
            &repo,
            Some(&head_tree),
            Some(&parent_tree),
        );

        let expected: std::collections::HashSet<std::path::PathBuf> =
            std::iter::once("file_c.txt".into()).collect();
        assert_eq!(changed_paths, Ok(expected));
    }

    temp.close().unwrap();
}