Skip to content

Commit c80bfc1

Browse files
committed
feat(ops): create get_changed_paths_between_trees_fast
This is a faster path than the one offered by libgit2. I didn't actually update `cherry_pick` to use this faster path, though, as it's a little more involved (you have to create temporary "dehydrated" trees, apply the cherry-pick, and then "rehydrate" them).
1 parent c7fa8c5 commit c80bfc1

File tree

2 files changed

+248
-0
lines changed

2 files changed

+248
-0
lines changed

src/ops.rs

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
//! They serve both as examples of how to use `git2` and as code that should be usable in some limited
55
//! subset of cases.
66
7+
use std::collections::{HashMap, HashSet};
8+
use std::path::PathBuf;
9+
710
use itertools::Itertools;
811

912
/// Lookup the commit ID for `HEAD`
@@ -45,6 +48,217 @@ pub fn is_dirty(repo: &git2::Repository) -> bool {
4548
}
4649
}
4750

51+
/// This function is a hot code path. Do not annotate with `#[instrument]`, and
52+
/// be mindful of performance/memory allocations.
53+
fn get_changed_paths_between_trees_internal(
54+
repo: &git2::Repository,
55+
acc: &mut Vec<Vec<PathBuf>>,
56+
current_path: &[PathBuf],
57+
lhs: Option<&git2::Tree>,
58+
rhs: Option<&git2::Tree>,
59+
) -> ChangedPathsResult<()> {
60+
let lhs_entries: Vec<_> = lhs.map(|tree| tree.iter().collect()).unwrap_or_default();
61+
let lhs_entries: HashMap<&[u8], &git2::TreeEntry> = lhs_entries
62+
.iter()
63+
.map(|entry| (entry.name_bytes(), entry))
64+
.collect();
65+
66+
let rhs_entries: Vec<_> = rhs.map(|tree| tree.iter().collect()).unwrap_or_default();
67+
let rhs_entries: HashMap<&[u8], &git2::TreeEntry> = rhs_entries
68+
.iter()
69+
.map(|entry| (entry.name_bytes(), entry))
70+
.collect();
71+
72+
let all_entry_names: HashSet<&[u8]> = lhs_entries
73+
.keys()
74+
.chain(rhs_entries.keys())
75+
.cloned()
76+
.collect();
77+
let entries: HashMap<&[u8], (Option<&git2::TreeEntry>, Option<&git2::TreeEntry>)> =
78+
all_entry_names
79+
.into_iter()
80+
.map(|entry_name| {
81+
(
82+
entry_name,
83+
(
84+
lhs_entries.get(entry_name).copied(),
85+
rhs_entries.get(entry_name).copied(),
86+
),
87+
)
88+
})
89+
.collect();
90+
91+
for (entry_name, (lhs_entry, rhs_entry)) in entries {
92+
enum ClassifiedEntry {
93+
Absent,
94+
NotATree(git2::Oid, i32),
95+
Tree(git2::Oid, i32),
96+
}
97+
98+
fn classify_entry(entry: Option<&git2::TreeEntry>) -> ChangedPathsResult<ClassifiedEntry> {
99+
let entry = match entry {
100+
Some(entry) => entry,
101+
None => return Ok(ClassifiedEntry::Absent),
102+
};
103+
104+
let file_mode = entry.filemode_raw();
105+
match entry.kind() {
106+
Some(git2::ObjectType::Tree) => Ok(ClassifiedEntry::Tree(entry.id(), file_mode)),
107+
_ => Ok(ClassifiedEntry::NotATree(entry.id(), file_mode)),
108+
}
109+
}
110+
111+
let get_tree = |oid| match repo.find_tree(oid) {
112+
Ok(tree) => Ok(tree),
113+
Err(err) => Err(ChangedPathsError::TreeLookupFailure { source: err, oid }),
114+
};
115+
116+
let full_entry_path = {
117+
let entry_name = match std::str::from_utf8(entry_name) {
118+
Ok(entry_name) => entry_name,
119+
Err(_) => continue,
120+
};
121+
let mut full_entry_path = current_path.to_vec();
122+
full_entry_path.push(PathBuf::from(entry_name));
123+
full_entry_path
124+
};
125+
match (classify_entry(lhs_entry)?, classify_entry(rhs_entry)?) {
126+
(ClassifiedEntry::Absent, ClassifiedEntry::Absent) => {
127+
// Shouldn't happen, but there's no issue here.
128+
}
129+
130+
(
131+
ClassifiedEntry::NotATree(lhs_oid, lhs_file_mode),
132+
ClassifiedEntry::NotATree(rhs_oid, rhs_file_mode),
133+
) => {
134+
if lhs_oid == rhs_oid && lhs_file_mode == rhs_file_mode {
135+
// Unchanged file, do nothing.
136+
} else {
137+
// Changed file.
138+
acc.push(full_entry_path);
139+
}
140+
}
141+
142+
(ClassifiedEntry::Absent, ClassifiedEntry::NotATree(_, _))
143+
| (ClassifiedEntry::NotATree(_, _), ClassifiedEntry::Absent) => {
144+
// Added, removed, or changed file.
145+
acc.push(full_entry_path);
146+
}
147+
148+
(ClassifiedEntry::Absent, ClassifiedEntry::Tree(tree_oid, _))
149+
| (ClassifiedEntry::Tree(tree_oid, _), ClassifiedEntry::Absent) => {
150+
// A directory was added or removed. Add all entries from that
151+
// directory.
152+
let tree = get_tree(tree_oid)?;
153+
get_changed_paths_between_trees_internal(
154+
repo,
155+
acc,
156+
&full_entry_path,
157+
Some(&tree),
158+
None,
159+
)?;
160+
}
161+
162+
(ClassifiedEntry::NotATree(_, _), ClassifiedEntry::Tree(tree_oid, _))
163+
| (ClassifiedEntry::Tree(tree_oid, _), ClassifiedEntry::NotATree(_, _)) => {
164+
// A file was changed into a directory. Add both the file and
165+
// all subdirectory entries as changed entries.
166+
let tree = get_tree(tree_oid)?;
167+
get_changed_paths_between_trees_internal(
168+
repo,
169+
acc,
170+
&full_entry_path,
171+
Some(&tree),
172+
None,
173+
)?;
174+
acc.push(full_entry_path);
175+
}
176+
177+
(
178+
ClassifiedEntry::Tree(lhs_tree_oid, lhs_file_mode),
179+
ClassifiedEntry::Tree(rhs_tree_oid, rhs_file_mode),
180+
) => {
181+
match (
182+
(lhs_tree_oid == rhs_tree_oid),
183+
// Note that there should only be one possible file mode for
184+
// an entry which points to a tree, but it's possible that
185+
// some extra non-meaningful bits are set. Should we report
186+
// a change in that case? This code takes the conservative
187+
// approach and reports a change.
188+
(lhs_file_mode == rhs_file_mode),
189+
) {
190+
(true, true) => {
191+
// Unchanged entry, do nothing.
192+
}
193+
194+
(true, false) => {
195+
// Only the directory changed, but none of its contents.
196+
acc.push(full_entry_path);
197+
}
198+
199+
(false, true) => {
200+
let lhs_tree = get_tree(lhs_tree_oid)?;
201+
let rhs_tree = get_tree(rhs_tree_oid)?;
202+
203+
// Only include the files changed in the subtrees, and
204+
// not the directory itself.
205+
get_changed_paths_between_trees_internal(
206+
repo,
207+
acc,
208+
&full_entry_path,
209+
Some(&lhs_tree),
210+
Some(&rhs_tree),
211+
)?;
212+
}
213+
214+
(false, false) => {
215+
let lhs_tree = get_tree(lhs_tree_oid)?;
216+
let rhs_tree = get_tree(rhs_tree_oid)?;
217+
218+
get_changed_paths_between_trees_internal(
219+
repo,
220+
acc,
221+
&full_entry_path,
222+
Some(&lhs_tree),
223+
Some(&rhs_tree),
224+
)?;
225+
acc.push(full_entry_path);
226+
}
227+
}
228+
}
229+
}
230+
}
231+
232+
Ok(())
233+
}
234+
235+
#[derive(Debug, PartialEq)]
236+
pub enum ChangedPathsError {
237+
/// An error occurred when trying to look up a tree by OID.
238+
TreeLookupFailure { source: git2::Error, oid: git2::Oid },
239+
}
240+
241+
/// Shorthand for a `Result` whose error type is [`ChangedPathsError`].
pub type ChangedPathsResult<T> = Result<T, ChangedPathsError>;
242+
243+
/// Calculate which paths have changed between two trees more quickly than
244+
/// libgit2. See https://github.com/libgit2/libgit2/issues/6036 for more
245+
/// discussion.
246+
///
247+
/// The libgit2 implementation works by iterating both trees recursively and
248+
/// comparing them, which is O(n) in the size of the trees. This implementation
249+
/// works by mutually traversing both trees and stopping early for subtrees
250+
/// which are equal, which is O(n) in the number of *changes* instead.
251+
pub fn get_changed_paths_between_trees_fast(
252+
repo: &git2::Repository,
253+
lhs: Option<&git2::Tree>,
254+
rhs: Option<&git2::Tree>,
255+
) -> ChangedPathsResult<HashSet<PathBuf>> {
256+
let mut acc = Vec::new();
257+
get_changed_paths_between_trees_internal(repo, &mut acc, &Vec::new(), lhs, rhs)?;
258+
let changed_paths: HashSet<PathBuf> = acc.into_iter().map(PathBuf::from_iter).collect();
259+
Ok(changed_paths)
260+
}
261+
48262
/// Cherry pick a commit onto another without touching the working directory
49263
pub fn cherry_pick(
50264
repo: &git2::Repository,

tests/ops.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,3 +117,37 @@ fn reword() {
117117

118118
temp.close().unwrap();
119119
}
120+
121+
/// Diffing the tree of `HEAD` against the tree of its first parent in the
/// `branches.yml` fixture must report exactly `file_c.txt`.
#[test]
fn test_get_changed_paths_between_trees() {
    let fixture_path = std::path::Path::new("tests/fixtures/branches.yml");

    let temp = assert_fs::TempDir::new().unwrap();
    git_fixture::TodoList::load(fixture_path)
        .unwrap()
        .run(temp.path())
        .unwrap();

    let repo = git2::Repository::discover(temp.path()).unwrap();

    {
        let head_commit = repo.head().unwrap().peel_to_commit().unwrap();
        let head_tree = head_commit.tree().unwrap();
        let parent_tree = head_commit.parent(0).unwrap().tree().unwrap();

        let expected: std::collections::HashSet<std::path::PathBuf> =
            std::iter::once(std::path::PathBuf::from("file_c.txt")).collect();
        let actual = git2_ext::ops::get_changed_paths_between_trees_fast(
            &repo,
            Some(&head_tree),
            Some(&parent_tree),
        );
        assert_eq!(actual, Ok(expected));
    }

    temp.close().unwrap();
}

0 commit comments

Comments
 (0)