@@ -8,11 +8,11 @@ use std::ops::Range;
8
8
9
9
use gix_object:: tree:: { EntryKind , EntryMode } ;
10
10
11
- use crate :: blob:: DiffLineStats ;
11
+ use crate :: blob:: platform:: prepare_diff:: Operation ;
12
+ use crate :: blob:: { DiffLineStats , ResourceKind } ;
12
13
use crate :: rewrites:: { CopySource , Outcome } ;
13
14
use crate :: { rewrites:: Tracker , Rewrites } ;
14
15
use bstr:: BStr ;
15
- use gix_object:: FindExt ;
16
16
17
17
/// The kind of a change.
18
18
#[ derive( Debug , Copy , Clone , Ord , PartialOrd , PartialEq , Eq ) ]
@@ -123,21 +123,21 @@ pub mod emit {
123
123
FindExistingBlob ( #[ from] gix_object:: find:: existing_object:: Error ) ,
124
124
#[ error( "Could not obtain exhaustive item set to use as possible sources for copy detection" ) ]
125
125
GetItemsForExhaustiveCopyDetection ( #[ source] Box < dyn std:: error:: Error + Send + Sync > ) ,
126
+ #[ error( transparent) ]
127
+ SetResource ( #[ from] crate :: blob:: platform:: set_resource:: Error ) ,
128
+ #[ error( transparent) ]
129
+ PrepareDiff ( #[ from] crate :: blob:: platform:: prepare_diff:: Error ) ,
126
130
}
127
131
}
128
132
129
133
/// Lifecycle
130
134
impl < T : Change > Tracker < T > {
131
- /// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing
132
- /// similarity checking.
133
- pub fn new ( rewrites : Rewrites , diff_algo : crate :: blob:: Algorithm ) -> Self {
135
+ /// Create a new instance with `rewrites` configuration.
136
+ pub fn new ( rewrites : Rewrites ) -> Self {
134
137
Tracker {
135
138
items : vec ! [ ] ,
136
139
path_backing : vec ! [ ] ,
137
- buf1 : Vec :: new ( ) ,
138
- buf2 : Vec :: new ( ) ,
139
140
rewrites,
140
- diff_algo,
141
141
}
142
142
}
143
143
}
@@ -177,25 +177,31 @@ impl<T: Change> Tracker<T> {
177
177
///
178
178
/// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
179
179
/// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
180
- /// be enabled as caching is implemented internally, after all, the blob that's actually diffed is going through conversion steps.
180
+ /// be enabled as caching is implemented by `diff_cache`; after all, the blob that's actually diffed is going
181
+ /// through conversion steps.
181
182
///
182
- /// Use `worktree_filter` to obtain working-tree versions of files present on disk before diffing to see if rewrites happened,
183
- /// with text-conversions being applied afterwards.
183
+ /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
184
+ /// the diff-algorithm (provided no user-algorithm is set).
185
+ /// Note that we control a few options of `diff_cache` to ensure it will ignore external commands.
186
+ /// Note that we do not control how the `diff_cache` converts resources; it's left to the caller to decide
187
+ /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
184
188
///
185
189
/// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
186
190
/// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
187
191
/// will panic if `change` is not a modification, and it's valid to not call `push` at all.
188
192
pub fn emit < PushSourceTreeFn , E > (
189
193
& mut self ,
190
194
mut cb : impl FnMut ( visit:: Destination < ' _ , T > , Option < visit:: Source < ' _ > > ) -> crate :: tree:: visit:: Action ,
191
- objects : & dyn gix_object :: Find ,
192
- _worktree_filter : & mut gix_filter :: Pipeline ,
195
+ diff_cache : & mut crate :: blob :: Platform ,
196
+ objects : & impl gix_object :: FindObjectOrHeader ,
193
197
mut push_source_tree : PushSourceTreeFn ,
194
198
) -> Result < Outcome , emit:: Error >
195
199
where
196
200
PushSourceTreeFn : FnMut ( & mut dyn FnMut ( T , & BStr ) ) -> Result < ( ) , E > ,
197
201
E : std:: error:: Error + Send + Sync + ' static ,
198
202
{
203
+ diff_cache. options . skip_internal_diff_if_external_is_configured = false ;
204
+
199
205
fn by_id_and_location < T : Change > ( a : & Item < T > , b : & Item < T > ) -> std:: cmp:: Ordering {
200
206
a. change
201
207
. id ( )
@@ -213,11 +219,19 @@ impl<T: Change> Tracker<T> {
213
219
& mut cb,
214
220
self . rewrites . percentage ,
215
221
& mut out,
222
+ diff_cache,
216
223
objects,
217
224
) ?;
218
225
219
226
if let Some ( copies) = self . rewrites . copies {
220
- self . match_pairs_of_kind ( visit:: SourceKind :: Copy , & mut cb, copies. percentage , & mut out, objects) ?;
227
+ self . match_pairs_of_kind (
228
+ visit:: SourceKind :: Copy ,
229
+ & mut cb,
230
+ copies. percentage ,
231
+ & mut out,
232
+ diff_cache,
233
+ objects,
234
+ ) ?;
221
235
222
236
match copies. source {
223
237
CopySource :: FromSetOfModifiedFiles => { }
@@ -233,7 +247,14 @@ impl<T: Change> Tracker<T> {
233
247
. map_err ( |err| emit:: Error :: GetItemsForExhaustiveCopyDetection ( Box :: new ( err) ) ) ?;
234
248
self . items . sort_by ( by_id_and_location) ;
235
249
236
- self . match_pairs_of_kind ( visit:: SourceKind :: Copy , & mut cb, copies. percentage , & mut out, objects) ?;
250
+ self . match_pairs_of_kind (
251
+ visit:: SourceKind :: Copy ,
252
+ & mut cb,
253
+ copies. percentage ,
254
+ & mut out,
255
+ diff_cache,
256
+ objects,
257
+ ) ?;
237
258
}
238
259
}
239
260
}
@@ -263,11 +284,14 @@ impl<T: Change> Tracker<T> {
263
284
cb : & mut impl FnMut ( visit:: Destination < ' _ , T > , Option < visit:: Source < ' _ > > ) -> crate :: tree:: visit:: Action ,
264
285
percentage : Option < f32 > ,
265
286
out : & mut Outcome ,
266
- objects : & dyn gix_object:: Find ,
287
+ diff_cache : & mut crate :: blob:: Platform ,
288
+ objects : & impl gix_object:: FindObjectOrHeader ,
267
289
) -> Result < ( ) , emit:: Error > {
268
290
// we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
269
291
let needs_second_pass = !needs_exact_match ( percentage) ;
270
- if self . match_pairs ( cb, None /* by identity */ , kind, out, objects) ? == crate :: tree:: visit:: Action :: Cancel {
292
+ if self . match_pairs ( cb, None /* by identity */ , kind, out, diff_cache, objects) ?
293
+ == crate :: tree:: visit:: Action :: Cancel
294
+ {
271
295
return Ok ( ( ) ) ;
272
296
}
273
297
if needs_second_pass {
@@ -292,7 +316,7 @@ impl<T: Change> Tracker<T> {
292
316
}
293
317
} ;
294
318
if !is_limited {
295
- self . match_pairs ( cb, percentage, kind, out, objects) ?;
319
+ self . match_pairs ( cb, percentage, kind, out, diff_cache , objects) ?;
296
320
}
297
321
}
298
322
Ok ( ( ) )
@@ -304,9 +328,9 @@ impl<T: Change> Tracker<T> {
304
328
percentage : Option < f32 > ,
305
329
kind : visit:: SourceKind ,
306
330
stats : & mut Outcome ,
307
- objects : & dyn gix_object:: Find ,
331
+ diff_cache : & mut crate :: blob:: Platform ,
332
+ objects : & impl gix_object:: FindObjectOrHeader ,
308
333
) -> Result < crate :: tree:: visit:: Action , emit:: Error > {
309
- // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()`
310
334
let mut dest_ofs = 0 ;
311
335
while let Some ( ( mut dest_idx, dest) ) = self . items [ dest_ofs..] . iter ( ) . enumerate ( ) . find_map ( |( idx, item) | {
312
336
( !item. emitted && matches ! ( item. change. kind( ) , ChangeKind :: Addition ) ) . then_some ( ( idx, item) )
@@ -317,12 +341,12 @@ impl<T: Change> Tracker<T> {
317
341
& self . items ,
318
342
dest,
319
343
dest_idx,
320
- percentage. map ( |p| ( p , self . diff_algo ) ) ,
344
+ percentage,
321
345
kind,
322
346
stats,
323
347
objects,
324
- & mut self . buf1 ,
325
- & mut self . buf2 ,
348
+ diff_cache ,
349
+ & self . path_backing ,
326
350
) ?
327
351
. map ( |( src_idx, src, diff) | {
328
352
let ( id, entry_mode) = src. change . id_and_entry_mode ( ) ;
@@ -409,15 +433,15 @@ fn find_match<'a, T: Change>(
409
433
items : & ' a [ Item < T > ] ,
410
434
item : & Item < T > ,
411
435
item_idx : usize ,
412
- percentage : Option < ( f32 , crate :: blob :: Algorithm ) > ,
436
+ percentage : Option < f32 > ,
413
437
kind : visit:: SourceKind ,
414
438
stats : & mut Outcome ,
415
- objects : & dyn gix_object:: Find ,
416
- buf1 : & mut Vec < u8 > ,
417
- buf2 : & mut Vec < u8 > ,
439
+ objects : & impl gix_object:: FindObjectOrHeader ,
440
+ diff_cache : & mut crate :: blob :: Platform ,
441
+ path_backing : & [ u8 ] ,
418
442
) -> Result < Option < SourceTuple < ' a , T > > , emit:: Error > {
419
443
let ( item_id, item_mode) = item. change . id_and_entry_mode ( ) ;
420
- if needs_exact_match ( percentage. map ( |t| t . 0 ) ) || item_mode. is_link ( ) {
444
+ if needs_exact_match ( percentage) || item_mode. is_link ( ) {
421
445
let first_idx = items. partition_point ( |a| a. change . id ( ) < item_id) ;
422
446
let range = match items. get ( first_idx..) . map ( |items| {
423
447
let end = items
@@ -440,55 +464,76 @@ fn find_match<'a, T: Change>(
440
464
return Ok ( Some ( src) ) ;
441
465
}
442
466
} else {
443
- let mut new = None ;
444
- let ( percentage, algo ) = percentage. expect ( "it's set to something below 1.0 and we assured this" ) ;
467
+ let mut has_new = false ;
468
+ let percentage = percentage. expect ( "it's set to something below 1.0 and we assured this" ) ;
445
469
debug_assert_eq ! (
446
470
item. change. entry_mode( ) . kind( ) ,
447
471
EntryKind :: Blob ,
448
472
"symlinks are matched exactly, and trees aren't used here"
449
473
) ;
474
+
450
475
for ( can_idx, src) in items
451
476
. iter ( )
452
477
. enumerate ( )
453
478
. filter ( |( src_idx, src) | * src_idx != item_idx && src. is_source_for_destination_of ( kind, item_mode) )
454
479
{
455
- let new = match & new {
456
- Some ( new) => new,
457
- None => {
458
- new = objects. find_blob ( item_id, buf1) ?. into ( ) ;
459
- new. as_ref ( ) . expect ( "just set" )
460
- }
461
- } ;
462
- let old = objects. find_blob ( src. change . id ( ) , buf2) ?;
463
- // TODO: make sure we get attribute handling/worktree conversion and binary skips and filters right here.
464
- let tokens = crate :: blob:: intern:: InternedInput :: new (
465
- crate :: blob:: sources:: byte_lines_with_terminator ( old. data ) ,
466
- crate :: blob:: sources:: byte_lines_with_terminator ( new. data ) ,
467
- ) ;
468
- let counts = crate :: blob:: diff (
469
- algo,
470
- & tokens,
471
- crate :: blob:: sink:: Counter :: new ( diff:: Statistics {
472
- removed_bytes : 0 ,
473
- input : & tokens,
474
- } ) ,
475
- ) ;
476
- let similarity = ( old. data . len ( ) - counts. wrapped ) as f32 / old. data . len ( ) . max ( new. data . len ( ) ) as f32 ;
480
+ if !has_new {
481
+ diff_cache. set_resource (
482
+ item_id. to_owned ( ) ,
483
+ item_mode. kind ( ) ,
484
+ item. location ( path_backing) ,
485
+ ResourceKind :: NewOrDestination ,
486
+ objects,
487
+ ) ?;
488
+ has_new = true ;
489
+ }
490
+ let ( src_id, src_mode) = src. change . id_and_entry_mode ( ) ;
491
+ diff_cache. set_resource (
492
+ src_id. to_owned ( ) ,
493
+ src_mode. kind ( ) ,
494
+ src. location ( path_backing) ,
495
+ ResourceKind :: OldOrSource ,
496
+ objects,
497
+ ) ?;
498
+ let prep = diff_cache. prepare_diff ( ) ?;
477
499
stats. num_similarity_checks += 1 ;
478
- if similarity >= percentage {
479
- return Ok ( Some ( (
480
- can_idx,
481
- src,
482
- DiffLineStats {
483
- removals : counts. removals ,
484
- insertions : counts. insertions ,
485
- before : tokens. before . len ( ) . try_into ( ) . expect ( "interner handles only u32" ) ,
486
- after : tokens. after . len ( ) . try_into ( ) . expect ( "interner handles only u32" ) ,
487
- similarity,
500
+ match prep. operation {
501
+ Operation :: InternalDiff { algorithm } => {
502
+ let tokens =
503
+ crate :: blob:: intern:: InternedInput :: new ( prep. old . intern_source ( ) , prep. new . intern_source ( ) ) ;
504
+ let counts = crate :: blob:: diff (
505
+ algorithm,
506
+ & tokens,
507
+ crate :: blob:: sink:: Counter :: new ( diff:: Statistics {
508
+ removed_bytes : 0 ,
509
+ input : & tokens,
510
+ } ) ,
511
+ ) ;
512
+ let old_data_len = prep. old . data . as_slice ( ) . unwrap_or_default ( ) . len ( ) ;
513
+ let new_data_len = prep. new . data . as_slice ( ) . unwrap_or_default ( ) . len ( ) ;
514
+ let similarity = ( old_data_len - counts. wrapped ) as f32 / old_data_len. max ( new_data_len) as f32 ;
515
+ if similarity >= percentage {
516
+ return Ok ( Some ( (
517
+ can_idx,
518
+ src,
519
+ DiffLineStats {
520
+ removals : counts. removals ,
521
+ insertions : counts. insertions ,
522
+ before : tokens. before . len ( ) . try_into ( ) . expect ( "interner handles only u32" ) ,
523
+ after : tokens. after . len ( ) . try_into ( ) . expect ( "interner handles only u32" ) ,
524
+ similarity,
525
+ }
526
+ . into ( ) ,
527
+ ) ) ) ;
488
528
}
489
- . into ( ) ,
490
- ) ) ) ;
491
- }
529
+ }
530
+ Operation :: ExternalCommand { .. } => {
531
+ unreachable ! ( "we have disabled this possibility with an option" )
532
+ }
533
+ Operation :: SourceOrDestinationIsBinary => {
534
+ // TODO: figure out if git does more here
535
+ }
536
+ } ;
492
537
}
493
538
}
494
539
Ok ( None )
0 commit comments