Skip to content

Commit 0002bca

Browse files
committed
diffcore-rename: use relevant_sources to filter possible rename sources
Have merge-ort pass the computed list of relevant_sources on to diffcore_rename_extended(), and use that list after exact rename detection to further filter the list of possible rename sources before doing inexact rename detection. Since it is often the case that only a small subset of sources have been modified on the side of history that did not rename/delete those files, this can dramatically accelerate rename detection performance. There are two subtle points that maximize the effectiveness of this optimization, without which the performance improvements from this patch would be far less pronounced (which I know from experience of trying it the wrong way first): * Instead of adding to the relevant_sources strset, we could have just avoided creating the diff_filepair for any irrelevant sources. However, that is actually slower, because exact rename detection is able to remove both sources and destinations from the comparison matrix, whereas relevant_source checking can only remove sources. So we want to allow exact rename detection (which needs both the sources and destination diff_filepairs available), and then we pass the relevant_sources to diffcore_rename_extended() so it can filter the list of sources after that point. * Earlier in this series, we filtered rename_src in diffcore-rename to remove the sources corresponding to exact renames from the list to focus our iteration comparisons on relevant entries. This change made almost no performance difference at the time. However, that change became critically important here. If instead of filtering rename_src before the inexact rename detection, we for every one->path source checked strset_contains(relevant_sources, one->path) inside the j < num_sources loop, then in a repository with 30k renames we'd have to do 30k strset_contains() calls PER outer loop, meaning 30k * 30k = 900 million strset_contains() calls. That would kill performance and erase many of the gains from this optimization. It's probably also worth noting that this optimization was only possible due to the changes to make various conflict types more consistent (which work was done precisely with this change in mind): * bringing consistency to add/add, rename/add, and rename/rename conflict types, as done back in the topic merged at commit ac193e0 ("Merge branch 'en/merge-path-collision'", 2019-01-04), and further extended in commits 2a7c16c ("t6422, t6426: be more flexible for add/add conflicts involving renames", 2020-08-10) and e8eb99d ("t642[23]: be more flexible for add/add conflicts involving pair renames", 2020-08-10) * making rename/delete more consistent with modify/delete as done in commits 1f3c9ba ("t6425: be more flexible with rename/delete conflict messages", 2020-08-10) and 727c75b ("t6404, t6423: expect improved rename/delete handling in ort backend", 2020-10-26) For the testcases mentioned in commit 557ac03 ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 12.596 s ± 0.061 s 6.003 s ± 0.048 s mega-renames: 130.465 s ± 0.259 s 114.009 s ± 0.236 s just-one-mega: 3.958 s ± 0.010 s 3.489 s ± 0.017 s Signed-off-by: Elijah Newren <[email protected]>
1 parent bd27b9e commit 0002bca

File tree

3 files changed

+21
-7
lines changed

3 files changed

+21
-7
lines changed

diffcore-rename.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -978,11 +978,12 @@ static int find_renames(struct diff_score *mx,
978978
return count;
979979
}
980980

981-
static void remove_unneeded_paths_from_src(int detecting_copies)
981+
static void remove_unneeded_paths_from_src(int detecting_copies,
982+
struct strset *interesting)
982983
{
983984
int i, new_num_src;
984985

985-
if (detecting_copies)
986+
if (detecting_copies && !interesting)
986987
return; /* nothing to remove */
987988
if (break_idx)
988989
return; /* culling incompatbile with break detection */
@@ -1009,12 +1010,18 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
10091010
* from rename_src here.
10101011
*/
10111012
for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
1013+
struct diff_filespec *one = rename_src[i].p->one;
1014+
10121015
/*
10131016
* renames are stored in rename_dst, so if a rename has
10141017
* already been detected using this source, we can just
10151018
* remove the source knowing rename_dst has its info.
10161019
*/
1017-
if (rename_src[i].p->one->rename_used)
1020+
if (!detecting_copies && one->rename_used)
1021+
continue;
1022+
1023+
/* If we don't care about the source path, skip it */
1024+
if (interesting && !strset_contains(interesting, one->path))
10181025
continue;
10191026

10201027
if (new_num_src < i)
@@ -1027,6 +1034,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
10271034
}
10281035

10291036
void diffcore_rename_extended(struct diff_options *options,
1037+
struct strset *relevant_sources,
10301038
struct strset *dirs_removed,
10311039
struct strmap *dir_rename_count)
10321040
{
@@ -1047,6 +1055,8 @@ void diffcore_rename_extended(struct diff_options *options,
10471055
want_copies = (detect_rename == DIFF_DETECT_COPY);
10481056
if (dirs_removed && (break_idx || want_copies))
10491057
BUG("dirs_removed incompatible with break/copy detection");
1058+
if (break_idx && relevant_sources)
1059+
BUG("break detection incompatible with source specification");
10501060
if (!minimum_score)
10511061
minimum_score = DEFAULT_RENAME_SCORE;
10521062

@@ -1114,9 +1124,10 @@ void diffcore_rename_extended(struct diff_options *options,
11141124
/*
11151125
* Cull sources:
11161126
* - remove ones corresponding to exact renames
1127+
* - remove ones not found in relevant_sources
11171128
*/
11181129
trace2_region_enter("diff", "cull after exact", options->repo);
1119-
remove_unneeded_paths_from_src(want_copies);
1130+
remove_unneeded_paths_from_src(want_copies, relevant_sources);
11201131
trace2_region_leave("diff", "cull after exact", options->repo);
11211132
} else {
11221133
/* Determine minimum score to match basenames */
@@ -1135,7 +1146,7 @@ void diffcore_rename_extended(struct diff_options *options,
11351146
* - remove ones involved in renames (found via exact match)
11361147
*/
11371148
trace2_region_enter("diff", "cull after exact", options->repo);
1138-
remove_unneeded_paths_from_src(want_copies);
1149+
remove_unneeded_paths_from_src(want_copies, NULL);
11391150
trace2_region_leave("diff", "cull after exact", options->repo);
11401151

11411152
/* Preparation for basename-driven matching. */
@@ -1155,9 +1166,10 @@ void diffcore_rename_extended(struct diff_options *options,
11551166
/*
11561167
* Cull sources, again:
11571168
* - remove ones involved in renames (found via basenames)
1169+
* - remove ones not found in relevant_sources
11581170
*/
11591171
trace2_region_enter("diff", "cull basename", options->repo);
1160-
remove_unneeded_paths_from_src(want_copies);
1172+
remove_unneeded_paths_from_src(want_copies, relevant_sources);
11611173
trace2_region_leave("diff", "cull basename", options->repo);
11621174
}
11631175

@@ -1332,5 +1344,5 @@ void diffcore_rename_extended(struct diff_options *options,
13321344

13331345
void diffcore_rename(struct diff_options *options)
13341346
{
1335-
diffcore_rename_extended(options, NULL, NULL);
1347+
diffcore_rename_extended(options, NULL, NULL, NULL);
13361348
}

diffcore.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count);
166166
void diffcore_break(struct repository *, int);
167167
void diffcore_rename(struct diff_options *);
168168
void diffcore_rename_extended(struct diff_options *options,
169+
struct strset *relevant_sources,
169170
struct strset *dirs_removed,
170171
struct strmap *dir_rename_count);
171172
void diffcore_merge_broken(void);

merge-ort.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2209,6 +2209,7 @@ static void detect_regular_renames(struct merge_options *opt,
22092209
diff_queued_diff = renames->pairs[side_index];
22102210
trace2_region_enter("diff", "diffcore_rename", opt->repo);
22112211
diffcore_rename_extended(&diff_opts,
2212+
&renames->relevant_sources[side_index],
22122213
&renames->dirs_removed[side_index],
22132214
&renames->dir_rename_count[side_index]);
22142215
trace2_region_leave("diff", "diffcore_rename", opt->repo);

0 commit comments

Comments
 (0)