Skip to content

Commit bd24aa2

Browse files
newren authored and gitster committed
diffcore-rename: guide inexact rename detection based on basenames
Make use of the new find_basename_matches() function added in the last two patches, to find renames more rapidly in cases where we can match up files based on basenames. As a quick reminder (see the last two commit messages for more details), this means for example that docs/extensions.txt and docs/config/extensions.txt are considered likely renames if there are no remaining 'extensions.txt' files elsewhere among the added and deleted files, and if a similarity check confirms they are similar, then they are marked as a rename without looking for a better similarity match among other files. This is a behavioral change, as covered in more detail in the previous commit message.

We do not use this heuristic together with either break or copy detection. The point of break detection is to say that filename similarity does not imply file content similarity, and we only want to know about file content similarity. The point of copy detection is to use more resources to check for additional similarities, while this is an optimization that uses far less resources but which might also result in finding slightly fewer similarities. So the idea behind this optimization goes against both of those features, and will be turned off for both.

For the testcases mentioned in commit 557ac03 ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows:

                         Before                  After
    no-renames:        13.815 s ±  0.062 s    13.294 s ±  0.103 s
    mega-renames:    1799.937 s ±  0.493 s   187.248 s ±  0.882 s
    just-one-mega:     51.289 s ±  0.019 s     5.557 s ±  0.017 s

Signed-off-by: Elijah Newren <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent da09f65 commit bd24aa2

File tree

2 files changed

+52
-8
lines changed

2 files changed

+52
-8
lines changed

diffcore-rename.c

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,6 @@ static const char *get_basename(const char *filename)
379379
return base ? base + 1 : filename;
380380
}
381381

382-
MAYBE_UNUSED
383382
static int find_basename_matches(struct diff_options *options,
384383
int minimum_score)
385384
{
@@ -716,11 +715,55 @@ void diffcore_rename(struct diff_options *options)
716715
if (minimum_score == MAX_SCORE)
717716
goto cleanup;
718717

719-
/* Calculate how many renames are left */
720-
num_destinations = (rename_dst_nr - rename_count);
721-
remove_unneeded_paths_from_src(want_copies);
722718
num_sources = rename_src_nr;
723719

720+
if (want_copies || break_idx) {
721+
/*
722+
* Cull sources:
723+
* - remove ones corresponding to exact renames
724+
*/
725+
trace2_region_enter("diff", "cull after exact", options->repo);
726+
remove_unneeded_paths_from_src(want_copies);
727+
trace2_region_leave("diff", "cull after exact", options->repo);
728+
} else {
729+
/* Determine minimum score to match basenames */
730+
double factor = 0.5;
731+
char *basename_factor = getenv("GIT_BASENAME_FACTOR");
732+
int min_basename_score;
733+
734+
if (basename_factor)
735+
factor = strtol(basename_factor, NULL, 10)/100.0;
736+
assert(factor >= 0.0 && factor <= 1.0);
737+
min_basename_score = minimum_score +
738+
(int)(factor * (MAX_SCORE - minimum_score));
739+
740+
/*
741+
* Cull sources:
742+
* - remove ones involved in renames (found via exact match)
743+
*/
744+
trace2_region_enter("diff", "cull after exact", options->repo);
745+
remove_unneeded_paths_from_src(want_copies);
746+
trace2_region_leave("diff", "cull after exact", options->repo);
747+
748+
/* Utilize file basenames to quickly find renames. */
749+
trace2_region_enter("diff", "basename matches", options->repo);
750+
rename_count += find_basename_matches(options,
751+
min_basename_score);
752+
trace2_region_leave("diff", "basename matches", options->repo);
753+
754+
/*
755+
* Cull sources, again:
756+
* - remove ones involved in renames (found via basenames)
757+
*/
758+
trace2_region_enter("diff", "cull basename", options->repo);
759+
remove_unneeded_paths_from_src(want_copies);
760+
trace2_region_leave("diff", "cull basename", options->repo);
761+
}
762+
763+
/* Calculate how many rename destinations are left */
764+
num_destinations = (rename_dst_nr - rename_count);
765+
num_sources = rename_src_nr; /* rename_src_nr reflects lower number */
766+
724767
/* All done? */
725768
if (!num_destinations || !num_sources)
726769
goto cleanup;
@@ -751,7 +794,7 @@ void diffcore_rename(struct diff_options *options)
751794
struct diff_score *m;
752795

753796
if (rename_dst[i].is_rename)
754-
continue; /* dealt with exact match already. */
797+
continue; /* exact or basename match already handled */
755798

756799
m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
757800
for (j = 0; j < NUM_CANDIDATE_PER_DST; j++)

t/t4001-diff-rename.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,10 +277,11 @@ test_expect_success 'basename similarity vs best similarity' '
277277
git add file.txt file.md &&
278278
git commit -a -m "rename" &&
279279
git diff-tree -r -M --name-status HEAD^ HEAD >actual &&
280-
# subdir/file.txt is 88% similar to file.md and 78% similar to file.txt
280+
# subdir/file.txt is 88% similar to file.md, 78% similar to file.txt,
281+
# but since same basenames are checked first...
281282
cat >expected <<-\EOF &&
282-
R088 subdir/file.txt file.md
283-
A file.txt
283+
A file.md
284+
R078 subdir/file.txt file.txt
284285
EOF
285286
test_cmp expected actual
286287
'

0 commit comments

Comments
 (0)