Skip to content

Commit feca5d0

Browse files
committed
feat!: rewrite-tracker now uses blob diff-platform for correct and optimized diffing.
Correctness is improved as all necessary transformation are now performed. Performance is improved by avoiding duplicate work by caching transformed diffable data for later reuse.
1 parent ebe66db commit feca5d0

File tree

4 files changed

+143
-76
lines changed

4 files changed

+143
-76
lines changed

gix-diff/src/blob/platform.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ impl Platform {
351351
mode: gix_object::tree::EntryKind,
352352
rela_path: &BStr,
353353
kind: ResourceKind,
354-
objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable
354+
objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable, then also make tracker.rs `objects` dyn
355355
) -> Result<(), set_resource::Error> {
356356
let res = self.set_resource_inner(id, mode, rela_path, kind, objects);
357357
if res.is_err() {

gix-diff/src/rewrites/mod.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,8 @@ pub struct Tracker<T> {
1010
items: Vec<tracker::Item<T>>,
1111
/// A place to store all paths in to reduce amount of allocations.
1212
path_backing: Vec<u8>,
13-
/// A buffer for use when fetching objects for similarity tests.
14-
buf1: Vec<u8>,
15-
/// Another buffer for use when fetching objects for similarity tests.
16-
buf2: Vec<u8>,
1713
/// How to track copies and/or rewrites.
1814
rewrites: Rewrites,
19-
/// The diff algorithm to use when checking for similarity.
20-
diff_algo: crate::blob::Algorithm,
2115
}
2216

2317
/// Determine in which set of files to search for copies.

gix-diff/src/rewrites/tracker.rs

Lines changed: 110 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ use std::ops::Range;
88

99
use gix_object::tree::{EntryKind, EntryMode};
1010

11-
use crate::blob::DiffLineStats;
11+
use crate::blob::platform::prepare_diff::Operation;
12+
use crate::blob::{DiffLineStats, ResourceKind};
1213
use crate::rewrites::{CopySource, Outcome};
1314
use crate::{rewrites::Tracker, Rewrites};
1415
use bstr::BStr;
15-
use gix_object::FindExt;
1616

1717
/// The kind of a change.
1818
#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
@@ -123,21 +123,21 @@ pub mod emit {
123123
FindExistingBlob(#[from] gix_object::find::existing_object::Error),
124124
#[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
125125
GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
126+
#[error(transparent)]
127+
SetResource(#[from] crate::blob::platform::set_resource::Error),
128+
#[error(transparent)]
129+
PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
126130
}
127131
}
128132

129133
/// Lifecycle
130134
impl<T: Change> Tracker<T> {
131-
/// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing
132-
/// similarity checking.
133-
pub fn new(rewrites: Rewrites, diff_algo: crate::blob::Algorithm) -> Self {
135+
/// Create a new instance with `rewrites` configuration.
136+
pub fn new(rewrites: Rewrites) -> Self {
134137
Tracker {
135138
items: vec![],
136139
path_backing: vec![],
137-
buf1: Vec::new(),
138-
buf2: Vec::new(),
139140
rewrites,
140-
diff_algo,
141141
}
142142
}
143143
}
@@ -177,25 +177,31 @@ impl<T: Change> Tracker<T> {
177177
///
178178
/// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
179179
/// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
180-
/// be enabled as caching is implemented internally, after all, the blob that's actually diffed is going through conversion steps.
180+
/// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
181+
/// through conversion steps.
181182
///
182-
/// Use `worktree_filter` to obtain working-tree versions of files present on disk before diffing to see if rewrites happened,
183-
/// with text-conversions being applied afterwards.
183+
/// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
184+
/// the diff-algorithm (provided no user-algorithm is set).
185+
/// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
186+
/// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
187+
/// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
184188
///
185189
/// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
186190
/// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
187191
/// will panic if `change` is not a modification, and it's valid to not call `push` at all.
188192
pub fn emit<PushSourceTreeFn, E>(
189193
&mut self,
190194
mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
191-
objects: &dyn gix_object::Find,
192-
_worktree_filter: &mut gix_filter::Pipeline,
195+
diff_cache: &mut crate::blob::Platform,
196+
objects: &impl gix_object::FindObjectOrHeader,
193197
mut push_source_tree: PushSourceTreeFn,
194198
) -> Result<Outcome, emit::Error>
195199
where
196200
PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
197201
E: std::error::Error + Send + Sync + 'static,
198202
{
203+
diff_cache.options.skip_internal_diff_if_external_is_configured = false;
204+
199205
fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
200206
a.change
201207
.id()
@@ -213,11 +219,19 @@ impl<T: Change> Tracker<T> {
213219
&mut cb,
214220
self.rewrites.percentage,
215221
&mut out,
222+
diff_cache,
216223
objects,
217224
)?;
218225

219226
if let Some(copies) = self.rewrites.copies {
220-
self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, objects)?;
227+
self.match_pairs_of_kind(
228+
visit::SourceKind::Copy,
229+
&mut cb,
230+
copies.percentage,
231+
&mut out,
232+
diff_cache,
233+
objects,
234+
)?;
221235

222236
match copies.source {
223237
CopySource::FromSetOfModifiedFiles => {}
@@ -233,7 +247,14 @@ impl<T: Change> Tracker<T> {
233247
.map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
234248
self.items.sort_by(by_id_and_location);
235249

236-
self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, objects)?;
250+
self.match_pairs_of_kind(
251+
visit::SourceKind::Copy,
252+
&mut cb,
253+
copies.percentage,
254+
&mut out,
255+
diff_cache,
256+
objects,
257+
)?;
237258
}
238259
}
239260
}
@@ -263,11 +284,14 @@ impl<T: Change> Tracker<T> {
263284
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
264285
percentage: Option<f32>,
265286
out: &mut Outcome,
266-
objects: &dyn gix_object::Find,
287+
diff_cache: &mut crate::blob::Platform,
288+
objects: &impl gix_object::FindObjectOrHeader,
267289
) -> Result<(), emit::Error> {
268290
// we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
269291
let needs_second_pass = !needs_exact_match(percentage);
270-
if self.match_pairs(cb, None /* by identity */, kind, out, objects)? == crate::tree::visit::Action::Cancel {
292+
if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)?
293+
== crate::tree::visit::Action::Cancel
294+
{
271295
return Ok(());
272296
}
273297
if needs_second_pass {
@@ -292,7 +316,7 @@ impl<T: Change> Tracker<T> {
292316
}
293317
};
294318
if !is_limited {
295-
self.match_pairs(cb, percentage, kind, out, objects)?;
319+
self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?;
296320
}
297321
}
298322
Ok(())
@@ -304,9 +328,9 @@ impl<T: Change> Tracker<T> {
304328
percentage: Option<f32>,
305329
kind: visit::SourceKind,
306330
stats: &mut Outcome,
307-
objects: &dyn gix_object::Find,
331+
diff_cache: &mut crate::blob::Platform,
332+
objects: &impl gix_object::FindObjectOrHeader,
308333
) -> Result<crate::tree::visit::Action, emit::Error> {
309-
// TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()`
310334
let mut dest_ofs = 0;
311335
while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
312336
(!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item))
@@ -317,12 +341,12 @@ impl<T: Change> Tracker<T> {
317341
&self.items,
318342
dest,
319343
dest_idx,
320-
percentage.map(|p| (p, self.diff_algo)),
344+
percentage,
321345
kind,
322346
stats,
323347
objects,
324-
&mut self.buf1,
325-
&mut self.buf2,
348+
diff_cache,
349+
&self.path_backing,
326350
)?
327351
.map(|(src_idx, src, diff)| {
328352
let (id, entry_mode) = src.change.id_and_entry_mode();
@@ -409,15 +433,15 @@ fn find_match<'a, T: Change>(
409433
items: &'a [Item<T>],
410434
item: &Item<T>,
411435
item_idx: usize,
412-
percentage: Option<(f32, crate::blob::Algorithm)>,
436+
percentage: Option<f32>,
413437
kind: visit::SourceKind,
414438
stats: &mut Outcome,
415-
objects: &dyn gix_object::Find,
416-
buf1: &mut Vec<u8>,
417-
buf2: &mut Vec<u8>,
439+
objects: &impl gix_object::FindObjectOrHeader,
440+
diff_cache: &mut crate::blob::Platform,
441+
path_backing: &[u8],
418442
) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
419443
let (item_id, item_mode) = item.change.id_and_entry_mode();
420-
if needs_exact_match(percentage.map(|t| t.0)) || item_mode.is_link() {
444+
if needs_exact_match(percentage) || item_mode.is_link() {
421445
let first_idx = items.partition_point(|a| a.change.id() < item_id);
422446
let range = match items.get(first_idx..).map(|items| {
423447
let end = items
@@ -440,55 +464,76 @@ fn find_match<'a, T: Change>(
440464
return Ok(Some(src));
441465
}
442466
} else {
443-
let mut new = None;
444-
let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this");
467+
let mut has_new = false;
468+
let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
445469
debug_assert_eq!(
446470
item.change.entry_mode().kind(),
447471
EntryKind::Blob,
448472
"symlinks are matched exactly, and trees aren't used here"
449473
);
474+
450475
for (can_idx, src) in items
451476
.iter()
452477
.enumerate()
453478
.filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
454479
{
455-
let new = match &new {
456-
Some(new) => new,
457-
None => {
458-
new = objects.find_blob(item_id, buf1)?.into();
459-
new.as_ref().expect("just set")
460-
}
461-
};
462-
let old = objects.find_blob(src.change.id(), buf2)?;
463-
// TODO: make sure we get attribute handling/worktree conversion and binary skips and filters right here.
464-
let tokens = crate::blob::intern::InternedInput::new(
465-
crate::blob::sources::byte_lines_with_terminator(old.data),
466-
crate::blob::sources::byte_lines_with_terminator(new.data),
467-
);
468-
let counts = crate::blob::diff(
469-
algo,
470-
&tokens,
471-
crate::blob::sink::Counter::new(diff::Statistics {
472-
removed_bytes: 0,
473-
input: &tokens,
474-
}),
475-
);
476-
let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32;
480+
if !has_new {
481+
diff_cache.set_resource(
482+
item_id.to_owned(),
483+
item_mode.kind(),
484+
item.location(path_backing),
485+
ResourceKind::NewOrDestination,
486+
objects,
487+
)?;
488+
has_new = true;
489+
}
490+
let (src_id, src_mode) = src.change.id_and_entry_mode();
491+
diff_cache.set_resource(
492+
src_id.to_owned(),
493+
src_mode.kind(),
494+
src.location(path_backing),
495+
ResourceKind::OldOrSource,
496+
objects,
497+
)?;
498+
let prep = diff_cache.prepare_diff()?;
477499
stats.num_similarity_checks += 1;
478-
if similarity >= percentage {
479-
return Ok(Some((
480-
can_idx,
481-
src,
482-
DiffLineStats {
483-
removals: counts.removals,
484-
insertions: counts.insertions,
485-
before: tokens.before.len().try_into().expect("interner handles only u32"),
486-
after: tokens.after.len().try_into().expect("interner handles only u32"),
487-
similarity,
500+
match prep.operation {
501+
Operation::InternalDiff { algorithm } => {
502+
let tokens =
503+
crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
504+
let counts = crate::blob::diff(
505+
algorithm,
506+
&tokens,
507+
crate::blob::sink::Counter::new(diff::Statistics {
508+
removed_bytes: 0,
509+
input: &tokens,
510+
}),
511+
);
512+
let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
513+
let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
514+
let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
515+
if similarity >= percentage {
516+
return Ok(Some((
517+
can_idx,
518+
src,
519+
DiffLineStats {
520+
removals: counts.removals,
521+
insertions: counts.insertions,
522+
before: tokens.before.len().try_into().expect("interner handles only u32"),
523+
after: tokens.after.len().try_into().expect("interner handles only u32"),
524+
similarity,
525+
}
526+
.into(),
527+
)));
488528
}
489-
.into(),
490-
)));
491-
}
529+
}
530+
Operation::ExternalCommand { .. } => {
531+
unreachable!("we have disabled this possibility with an option")
532+
}
533+
Operation::SourceOrDestinationIsBinary => {
534+
// TODO: figure out if git does more here
535+
}
536+
};
492537
}
493538
}
494539
Ok(None)

gix-diff/tests/rewrites/tracker.rs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -564,23 +564,23 @@ mod util {
564564
pub fn assert_emit_with_objects(
565565
tracker: &mut rewrites::Tracker<Change>,
566566
cb: impl FnMut(Destination<'_, Change>, Option<Source<'_>>) -> Action,
567-
objects: impl gix_object::Find,
567+
objects: impl gix_object::FindObjectOrHeader,
568568
) -> rewrites::Outcome {
569569
assert_emit_with_objects_and_sources(tracker, cb, objects, None)
570570
}
571571

572572
pub fn assert_emit_with_objects_and_sources<'a>(
573573
tracker: &mut rewrites::Tracker<Change>,
574574
cb: impl FnMut(Destination<'_, Change>, Option<Source<'_>>) -> Action,
575-
objects: impl gix_object::Find,
575+
objects: impl gix_object::FindObjectOrHeader,
576576
sources: impl IntoIterator<Item = (Change, &'a str)>,
577577
) -> rewrites::Outcome {
578578
let mut sources: Vec<_> = sources.into_iter().collect();
579579
tracker
580580
.emit(
581581
cb,
582+
&mut new_platform_no_worktree(),
582583
&objects,
583-
&mut gix_filter::Pipeline::new(Default::default(), Default::default()),
584584
|cb| -> Result<(), std::io::Error> {
585585
let sources = std::mem::take(&mut sources);
586586
if sources.is_empty() {
@@ -596,6 +596,34 @@ mod util {
596596
}
597597

598598
pub fn new_tracker(rewrites: Rewrites) -> rewrites::Tracker<Change> {
599-
rewrites::Tracker::new(rewrites, gix_diff::blob::Algorithm::Myers)
599+
rewrites::Tracker::new(rewrites)
600+
}
601+
602+
fn new_platform_no_worktree() -> gix_diff::blob::Platform {
603+
let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh").expect("valid fixture");
604+
let attributes = gix_worktree::Stack::new(
605+
root,
606+
gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new(
607+
Default::default(),
608+
None,
609+
gix_worktree::stack::state::attributes::Source::IdMapping,
610+
Default::default(),
611+
)),
612+
gix_worktree::glob::pattern::Case::Sensitive,
613+
Vec::new(),
614+
Vec::new(),
615+
);
616+
let filter = gix_diff::blob::Pipeline::new(
617+
Default::default(),
618+
gix_filter::Pipeline::default(),
619+
Vec::new(),
620+
Default::default(),
621+
);
622+
gix_diff::blob::Platform::new(
623+
Default::default(),
624+
filter,
625+
gix_diff::blob::pipeline::Mode::ToGit,
626+
attributes,
627+
)
600628
}
601629
}

0 commit comments

Comments
 (0)