@@ -367,6 +367,144 @@ static int find_exact_renames(struct diff_options *options)
367
367
return renames ;
368
368
}
369
369
370
/*
 * Return the basename of `filename`: a pointer to the character just
 * after the last '/' in the string, or `filename` itself when it
 * contains no '/'.  The result points into `filename`; nothing is
 * allocated or copied.
 *
 * gitbasename() has to worry about special drives, multiple
 * directory separator characters, trailing slashes, NULL or
 * empty strings, etc.  We only work on filenames as stored in
 * git, and thus get to ignore all those complications.
 */
static const char *get_basename(const char *filename)
{
	const char *base = strrchr(filename, '/');

	return base ? base + 1 : filename;
}
381
+
382
+ static int find_basename_matches (struct diff_options * options ,
383
+ int minimum_score )
384
+ {
385
+ /*
386
+ * When I checked in early 2020, over 76% of file renames in linux
387
+ * just moved files to a different directory but kept the same
388
+ * basename. gcc did that with over 64% of renames, gecko did it
389
+ * with over 79%, and WebKit did it with over 89%.
390
+ *
391
+ * Therefore we can bypass the normal exhaustive NxM matrix
392
+ * comparison of similarities between all potential rename sources
393
+ * and destinations by instead using file basename as a hint (i.e.
394
+ * the portion of the filename after the last '/'), checking for
395
+ * similarity between files with the same basename, and if we find
396
+ * a pair that are sufficiently similar, record the rename pair and
397
+ * exclude those two from the NxM matrix.
398
+ *
399
+ * This *might* cause us to find a less than optimal pairing (if
400
+ * there is another file that we are even more similar to but has a
401
+ * different basename). Given the huge performance advantage
402
+ * basename matching provides, and given the frequency with which
403
+ * people use the same basename in real world projects, that's a
404
+ * trade-off we are willing to accept when doing just rename
405
+ * detection.
406
+ *
407
+ * If someone wants copy detection that implies they are willing to
408
+ * spend more cycles to find similarities between files, so it may
409
+ * be less likely that this heuristic is wanted. If someone is
410
+ * doing break detection, that means they do not want filename
411
+ * similarity to imply any form of content similiarity, and thus
412
+ * this heuristic would definitely be incompatible.
413
+ */
414
+
415
+ int i , renames = 0 ;
416
+ struct strintmap sources ;
417
+ struct strintmap dests ;
418
+ struct hashmap_iter iter ;
419
+ struct strmap_entry * entry ;
420
+
421
+ /*
422
+ * The prefeteching stuff wants to know if it can skip prefetching
423
+ * blobs that are unmodified...and will then do a little extra work
424
+ * to verify that the oids are indeed different before prefetching.
425
+ * Unmodified blobs are only relevant when doing copy detection;
426
+ * when limiting to rename detection, diffcore_rename[_extended]()
427
+ * will never be called with unmodified source paths fed to us, so
428
+ * the extra work necessary to check if rename_src entries are
429
+ * unmodified would be a small waste.
430
+ */
431
+ int skip_unmodified = 0 ;
432
+
433
+ /*
434
+ * Create maps of basename -> fullname(s) for remaining sources and
435
+ * dests.
436
+ */
437
+ strintmap_init_with_options (& sources , -1 , NULL , 0 );
438
+ strintmap_init_with_options (& dests , -1 , NULL , 0 );
439
+ for (i = 0 ; i < rename_src_nr ; ++ i ) {
440
+ char * filename = rename_src [i ].p -> one -> path ;
441
+ const char * base ;
442
+
443
+ /* exact renames removed in remove_unneeded_paths_from_src() */
444
+ assert (!rename_src [i ].p -> one -> rename_used );
445
+
446
+ /* Record index within rename_src (i) if basename is unique */
447
+ base = get_basename (filename );
448
+ if (strintmap_contains (& sources , base ))
449
+ strintmap_set (& sources , base , -1 );
450
+ else
451
+ strintmap_set (& sources , base , i );
452
+ }
453
+ for (i = 0 ; i < rename_dst_nr ; ++ i ) {
454
+ char * filename = rename_dst [i ].p -> two -> path ;
455
+ const char * base ;
456
+
457
+ if (rename_dst [i ].is_rename )
458
+ continue ; /* involved in exact match already. */
459
+
460
+ /* Record index within rename_dst (i) if basename is unique */
461
+ base = get_basename (filename );
462
+ if (strintmap_contains (& dests , base ))
463
+ strintmap_set (& dests , base , -1 );
464
+ else
465
+ strintmap_set (& dests , base , i );
466
+ }
467
+
468
+ /* Now look for basename matchups and do similarity estimation */
469
+ strintmap_for_each_entry (& sources , & iter , entry ) {
470
+ const char * base = entry -> key ;
471
+ intptr_t src_index = (intptr_t )entry -> value ;
472
+ intptr_t dst_index ;
473
+ if (src_index == -1 )
474
+ continue ;
475
+
476
+ if (0 <= (dst_index = strintmap_get (& dests , base ))) {
477
+ struct diff_filespec * one , * two ;
478
+ int score ;
479
+
480
+ /* Estimate the similarity */
481
+ one = rename_src [src_index ].p -> one ;
482
+ two = rename_dst [dst_index ].p -> two ;
483
+ score = estimate_similarity (options -> repo , one , two ,
484
+ minimum_score , skip_unmodified );
485
+
486
+ /* If sufficiently similar, record as rename pair */
487
+ if (score < minimum_score )
488
+ continue ;
489
+ record_rename_pair (dst_index , src_index , score );
490
+ renames ++ ;
491
+
492
+ /*
493
+ * Found a rename so don't need text anymore; if we
494
+ * didn't find a rename, the filespec_blob would get
495
+ * re-used when doing the matrix of comparisons.
496
+ */
497
+ diff_free_filespec_blob (one );
498
+ diff_free_filespec_blob (two );
499
+ }
500
+ }
501
+
502
+ strintmap_clear (& sources );
503
+ strintmap_clear (& dests );
504
+
505
+ return renames ;
506
+ }
507
+
370
508
#define NUM_CANDIDATE_PER_DST 4
371
509
static void record_if_better (struct diff_score m [], struct diff_score * o )
372
510
{
@@ -454,6 +592,54 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
454
592
return count ;
455
593
}
456
594
595
+ static void remove_unneeded_paths_from_src (int detecting_copies )
596
+ {
597
+ int i , new_num_src ;
598
+
599
+ if (detecting_copies )
600
+ return ; /* nothing to remove */
601
+ if (break_idx )
602
+ return ; /* culling incompatible with break detection */
603
+
604
+ /*
605
+ * Note on reasons why we cull unneeded sources but not destinations:
606
+ * 1) Pairings are stored in rename_dst (not rename_src), which we
607
+ * need to keep around. So, we just can't cull rename_dst even
608
+ * if we wanted to. But doing so wouldn't help because...
609
+ *
610
+ * 2) There is a matrix pairwise comparison that follows the
611
+ * "Performing inexact rename detection" progress message.
612
+ * Iterating over the destinations is done in the outer loop,
613
+ * hence we only iterate over each of those once and we can
614
+ * easily skip the outer loop early if the destination isn't
615
+ * relevant. That's only one check per destination path to
616
+ * skip.
617
+ *
618
+ * By contrast, the sources are iterated in the inner loop; if
619
+ * we check whether a source can be skipped, then we'll be
620
+ * checking it N separate times, once for each destination.
621
+ * We don't want to have to iterate over known-not-needed
622
+ * sources N times each, so avoid that by removing the sources
623
+ * from rename_src here.
624
+ */
625
+ for (i = 0 , new_num_src = 0 ; i < rename_src_nr ; i ++ ) {
626
+ /*
627
+ * renames are stored in rename_dst, so if a rename has
628
+ * already been detected using this source, we can just
629
+ * remove the source knowing rename_dst has its info.
630
+ */
631
+ if (rename_src [i ].p -> one -> rename_used )
632
+ continue ;
633
+
634
+ if (new_num_src < i )
635
+ memcpy (& rename_src [new_num_src ], & rename_src [i ],
636
+ sizeof (struct diff_rename_src ));
637
+ new_num_src ++ ;
638
+ }
639
+
640
+ rename_src_nr = new_num_src ;
641
+ }
642
+
457
643
void diffcore_rename (struct diff_options * options )
458
644
{
459
645
int detect_rename = options -> detect_rename ;
@@ -463,9 +649,11 @@ void diffcore_rename(struct diff_options *options)
463
649
struct diff_score * mx ;
464
650
int i , j , rename_count , skip_unmodified = 0 ;
465
651
int num_destinations , dst_cnt ;
652
+ int num_sources , want_copies ;
466
653
struct progress * progress = NULL ;
467
654
468
655
trace2_region_enter ("diff" , "setup" , options -> repo );
656
+ want_copies = (detect_rename == DIFF_DETECT_COPY );
469
657
if (!minimum_score )
470
658
minimum_score = DEFAULT_RENAME_SCORE ;
471
659
@@ -502,7 +690,7 @@ void diffcore_rename(struct diff_options *options)
502
690
p -> one -> rename_used ++ ;
503
691
register_rename_src (p );
504
692
}
505
- else if (detect_rename == DIFF_DETECT_COPY ) {
693
+ else if (want_copies ) {
506
694
/*
507
695
* Increment the "rename_used" score by
508
696
* one, to indicate ourselves as a user.
@@ -527,17 +715,60 @@ void diffcore_rename(struct diff_options *options)
527
715
if (minimum_score == MAX_SCORE )
528
716
goto cleanup ;
529
717
530
- /*
531
- * Calculate how many renames are left (but all the source
532
- * files still remain as options for rename/copies!)
533
- */
718
+ num_sources = rename_src_nr ;
719
+
720
+ if (want_copies || break_idx ) {
721
+ /*
722
+ * Cull sources:
723
+ * - remove ones corresponding to exact renames
724
+ */
725
+ trace2_region_enter ("diff" , "cull after exact" , options -> repo );
726
+ remove_unneeded_paths_from_src (want_copies );
727
+ trace2_region_leave ("diff" , "cull after exact" , options -> repo );
728
+ } else {
729
+ /* Determine minimum score to match basenames */
730
+ double factor = 0.5 ;
731
+ char * basename_factor = getenv ("GIT_BASENAME_FACTOR" );
732
+ int min_basename_score ;
733
+
734
+ if (basename_factor )
735
+ factor = strtol (basename_factor , NULL , 10 )/100.0 ;
736
+ assert (factor >= 0.0 && factor <= 1.0 );
737
+ min_basename_score = minimum_score +
738
+ (int )(factor * (MAX_SCORE - minimum_score ));
739
+
740
+ /*
741
+ * Cull sources:
742
+ * - remove ones involved in renames (found via exact match)
743
+ */
744
+ trace2_region_enter ("diff" , "cull after exact" , options -> repo );
745
+ remove_unneeded_paths_from_src (want_copies );
746
+ trace2_region_leave ("diff" , "cull after exact" , options -> repo );
747
+
748
+ /* Utilize file basenames to quickly find renames. */
749
+ trace2_region_enter ("diff" , "basename matches" , options -> repo );
750
+ rename_count += find_basename_matches (options ,
751
+ min_basename_score );
752
+ trace2_region_leave ("diff" , "basename matches" , options -> repo );
753
+
754
+ /*
755
+ * Cull sources, again:
756
+ * - remove ones involved in renames (found via basenames)
757
+ */
758
+ trace2_region_enter ("diff" , "cull basename" , options -> repo );
759
+ remove_unneeded_paths_from_src (want_copies );
760
+ trace2_region_leave ("diff" , "cull basename" , options -> repo );
761
+ }
762
+
763
+ /* Calculate how many rename destinations are left */
534
764
num_destinations = (rename_dst_nr - rename_count );
765
+ num_sources = rename_src_nr ; /* rename_src_nr reflects lower number */
535
766
536
767
/* All done? */
537
- if (!num_destinations )
768
+ if (!num_destinations || ! num_sources )
538
769
goto cleanup ;
539
770
540
- switch (too_many_rename_candidates (num_destinations , rename_src_nr ,
771
+ switch (too_many_rename_candidates (num_destinations , num_sources ,
541
772
options )) {
542
773
case 1 :
543
774
goto cleanup ;
@@ -553,7 +784,7 @@ void diffcore_rename(struct diff_options *options)
553
784
if (options -> show_rename_progress ) {
554
785
progress = start_delayed_progress (
555
786
_ ("Performing inexact rename detection" ),
556
- (uint64_t )num_destinations * (uint64_t )rename_src_nr );
787
+ (uint64_t )num_destinations * (uint64_t )num_sources );
557
788
}
558
789
559
790
mx = xcalloc (st_mult (NUM_CANDIDATE_PER_DST , num_destinations ),
@@ -563,7 +794,7 @@ void diffcore_rename(struct diff_options *options)
563
794
struct diff_score * m ;
564
795
565
796
if (rename_dst [i ].is_rename )
566
- continue ; /* dealt with exact match already. */
797
+ continue ; /* exact or basename match already handled */
567
798
568
799
m = & mx [dst_cnt * NUM_CANDIDATE_PER_DST ];
569
800
for (j = 0 ; j < NUM_CANDIDATE_PER_DST ; j ++ )
@@ -573,6 +804,8 @@ void diffcore_rename(struct diff_options *options)
573
804
struct diff_filespec * one = rename_src [j ].p -> one ;
574
805
struct diff_score this_src ;
575
806
807
+ assert (!one -> rename_used || want_copies || break_idx );
808
+
576
809
if (skip_unmodified &&
577
810
diff_unmodified_pair (rename_src [j ].p ))
578
811
continue ;
@@ -594,15 +827,15 @@ void diffcore_rename(struct diff_options *options)
594
827
}
595
828
dst_cnt ++ ;
596
829
display_progress (progress ,
597
- (uint64_t )dst_cnt * (uint64_t )rename_src_nr );
830
+ (uint64_t )dst_cnt * (uint64_t )num_sources );
598
831
}
599
832
stop_progress (& progress );
600
833
601
834
/* cost matrix sorted by most to least similar pair */
602
835
STABLE_QSORT (mx , dst_cnt * NUM_CANDIDATE_PER_DST , score_compare );
603
836
604
837
rename_count += find_renames (mx , dst_cnt , minimum_score , 0 );
605
- if (detect_rename == DIFF_DETECT_COPY )
838
+ if (want_copies )
606
839
rename_count += find_renames (mx , dst_cnt , minimum_score , 1 );
607
840
free (mx );
608
841
trace2_region_leave ("diff" , "inexact renames" , options -> repo );
0 commit comments