diffcore-rename: use relevant_sources to filter possible rename sources

newren · newren · commit 0002bcaeb2c8 · 2021-02-12T13:10:42.000-08:00
Have merge-ort pass the computed list of relevant_sources on to diffcore_rename_extended(), and use that list after exact rename detection to further filter the list of possible rename sources before doing inexact rename detection. Since it is often the case that only a small subset of sources have been modified on the side of history that did not rename/delete those files, this can dramatically accelerate rename detection performance.

There are two subtle points that maximize the effectiveness of this optimization, without which the performance improvements from this patch would be far less pronounced (which I know from experience of trying it the wrong way first):

  * Instead of adding to the relevant_sources strset, we could have just avoided creating the diff_filepair for any irrelevant sources. However, that is actually slower, because exact rename detection is able to remove both sources and destinations from the comparison matrix, whereas relevant_source checking can only remove sources. So we want to allow exact rename detection (which needs both the source and destination diff_filepairs available), and then we pass the relevant_sources to diffcore_rename_extended() so it can filter the list of sources after that point.

  * Earlier in this series, we filtered rename_src in diffcore-rename to remove the sources corresponding to exact renames from the list, to focus our iteration comparisons on relevant entries. This change made almost no performance difference at the time. However, that change became critically important here. If, instead of filtering rename_src before the inexact rename detection, we checked strset_contains(relevant_sources, one->path) for every source inside the j < num_sources loop, then in a repository with 30k renames we'd have to do 30k strset_contains() calls PER outer loop iteration, meaning 30k * 30k = 900 million strset_contains() calls. That would kill performance and erase many of the gains from this optimization.
It's probably also worth noting that this optimization was only possible due to the changes to make various conflict types more consistent (work that was done precisely with this change in mind):

  * bringing consistency to add/add, rename/add, and rename/rename conflict types, as done back in the topic merged at commit ac193e0 ("Merge branch 'en/merge-path-collision'", 2019-01-04), and further extended in commits 2a7c16c ("t6422, t6426: be more flexible for add/add conflicts involving renames", 2020-08-10) and e8eb99d ("t642[23]: be more flexible for add/add conflicts involving pair renames", 2020-08-10)

  * making rename/delete more consistent with modify/delete, as done in commits 1f3c9ba ("t6425: be more flexible with rename/delete conflict messages", 2020-08-10) and 727c75b ("t6404, t6423: expect improved rename/delete handling in ort backend", 2020-10-26)

For the testcases mentioned in commit 557ac03 ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows:

                            Before                  After
    no-renames:       12.596 s ±  0.061 s     6.003 s ±  0.048 s
    mega-renames:    130.465 s ±  0.259 s   114.009 s ±  0.236 s
    just-one-mega:     3.958 s ±  0.010 s     3.489 s ±  0.017 s

Signed-off-by: Elijah Newren <newren@gmail.com>
diff --git a/diffcore-rename.c b/diffcore-rename.c
@@ -978,11 +978,12 @@ static int find_renames(struct diff_score *mx,
 	return count;
 }
 
-static void remove_unneeded_paths_from_src(int detecting_copies)
+static void remove_unneeded_paths_from_src(int detecting_copies,
+					   struct strset *interesting)
 {
 	int i, new_num_src;
 
-	if (detecting_copies)
+	if (detecting_copies && !interesting)
 		return; /* nothing to remove */
 	if (break_idx)
 		return; /* culling incompatbile with break detection */
@@ -1009,12 +1010,18 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
 	 *      from rename_src here.
 	 */
 	for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
+		struct diff_filespec *one = rename_src[i].p->one;
+
 		/*
 		 * renames are stored in rename_dst, so if a rename has
 		 * already been detected using this source, we can just
 		 * remove the source knowing rename_dst has its info.
 		 */
-		if (rename_src[i].p->one->rename_used)
+		if (!detecting_copies && one->rename_used)
+			continue;
+
+		/* If we don't care about the source path, skip it */
+		if (interesting && !strset_contains(interesting, one->path))
 			continue;
 
 		if (new_num_src < i)
@@ -1027,6 +1034,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
 }
 
 void diffcore_rename_extended(struct diff_options *options,
+			      struct strset *relevant_sources,
 			      struct strset *dirs_removed,
 			      struct strmap *dir_rename_count)
 {
@@ -1047,6 +1055,8 @@ void diffcore_rename_extended(struct diff_options *options,
 	want_copies = (detect_rename == DIFF_DETECT_COPY);
 	if (dirs_removed && (break_idx || want_copies))
 		BUG("dirs_removed incompatible with break/copy detection");
+	if (break_idx && relevant_sources)
+		BUG("break detection incompatible with source specification");
 	if (!minimum_score)
 		minimum_score = DEFAULT_RENAME_SCORE;
 
@@ -1114,9 +1124,10 @@ void diffcore_rename_extended(struct diff_options *options,
 		/*
 		 * Cull sources:
 		 *   - remove ones corresponding to exact renames
+		 *   - remove ones not found in relevant_sources
 		 */
 		trace2_region_enter("diff", "cull after exact", options->repo);
-		remove_unneeded_paths_from_src(want_copies);
+		remove_unneeded_paths_from_src(want_copies, relevant_sources);
 		trace2_region_leave("diff", "cull after exact", options->repo);
 	} else {
 		/* Determine minimum score to match basenames */
@@ -1135,7 +1146,7 @@ void diffcore_rename_extended(struct diff_options *options,
 		 *   - remove ones involved in renames (found via exact match)
 		 */
 		trace2_region_enter("diff", "cull after exact", options->repo);
-		remove_unneeded_paths_from_src(want_copies);
+		remove_unneeded_paths_from_src(want_copies, NULL);
 		trace2_region_leave("diff", "cull after exact", options->repo);
 
 		/* Preparation for basename-driven matching. */
@@ -1155,9 +1166,10 @@ void diffcore_rename_extended(struct diff_options *options,
 		/*
 		 * Cull sources, again:
 		 *   - remove ones involved in renames (found via basenames)
+		 *   - remove ones not found in relevant_sources
 		 */
 		trace2_region_enter("diff", "cull basename", options->repo);
-		remove_unneeded_paths_from_src(want_copies);
+		remove_unneeded_paths_from_src(want_copies, relevant_sources);
 		trace2_region_leave("diff", "cull basename", options->repo);
 	}
 
@@ -1332,5 +1344,5 @@ void diffcore_rename_extended(struct diff_options *options,
 
 void diffcore_rename(struct diff_options *options)
 {
-	diffcore_rename_extended(options, NULL, NULL);
+	diffcore_rename_extended(options, NULL, NULL, NULL);
 }
diff --git a/diffcore.h b/diffcore.h
@@ -166,6 +166,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count);
 void diffcore_break(struct repository *, int);
 void diffcore_rename(struct diff_options *);
 void diffcore_rename_extended(struct diff_options *options,
+			      struct strset *relevant_sources,
 			      struct strset *dirs_removed,
 			      struct strmap *dir_rename_count);
 void diffcore_merge_broken(void);
diff --git a/merge-ort.c b/merge-ort.c
@@ -2209,6 +2209,7 @@ static void detect_regular_renames(struct merge_options *opt,
 	diff_queued_diff = renames->pairs[side_index];
 	trace2_region_enter("diff", "diffcore_rename", opt->repo);
 	diffcore_rename_extended(&diff_opts,
+				 &renames->relevant_sources[side_index],
 				 &renames->dirs_removed[side_index],
 				 &renames->dir_rename_count[side_index]);
 	trace2_region_leave("diff", "diffcore_rename", opt->repo);