Skip to content

Commit 9db2ac5

Browse files
newrengitster
authored andcommitted
diffcore-rename: accelerate rename_dst setup
register_rename_src() simply references the passed pair inside rename_src. In contrast, add_rename_dst() did something entirely different for rename_dst. Instead of copying the passed pair, it made a copy of the second diff_filespec from the passed pair, referenced it, and then set the diff_rename_dst.pair field to NULL. Later, when a pairing is found, record_rename_pair() allocated a full diff_filepair via diff_queue() and pointed its src and dst fields at the appropriate diff_filespecs. This contrast between register_rename_src() for the rename_src data structure and add_rename_dst() for the rename_dst data structure is oddly inconsistent and requires more memory and work than necessary. Let's just reference the original diff_filepair in rename_dst as-is, just as we do with rename_src. Add a new rename_dst.is_rename field, since the rename_dst.p field is never NULL unlike the old rename_dst.pair field. Taking advantage of this change and the fact that same-named paths will be adjacent, we can get rid of the sorting of the array and most of the lookups on it, allowing us to instead just append as we go. However, there is one remaining reason to still keep locate_rename_dst(): handling broken pairs (i.e. when break detection is on). Those are somewhat rare, but we can set up a simple strintmap to get the map between the source and the index. Doing that allows us to still have a fast lookup without sorting the rename_dst array. Since the sorting had been done in a weakly quadratic manner, when many renames are involved this time could add up. There is still a strcmp() in add_rename_dst() that I have left in place to make it easier to verify that the algorithm has the same results. This strcmp() is there to check for duplicate destination entries (which was the easiest way at the time to avoid segfaults in the diffcore-rename code when trees had multiple entries at a given path). The underlying double free()s are no longer an issue with the new algorithm, but that can be addressed in a subsequent commit. This patch is being submitted in a different order than its original development, but in a large rebase of many commits with lots of renames and with several optimizations to inexact rename detection, both setup time and write back to output queue time from diffcore_rename() were sizeable chunks of overall runtime. This patch accelerated the setup time by about 65%, and final write back to the output queue time by about 50%, resulting in an overall drop of 3.5% on the execution time of rebasing a few dozen patches. Signed-off-by: Elijah Newren <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent b970b4e commit 9db2ac5

File tree

1 file changed

+65
-83
lines changed

1 file changed

+65
-83
lines changed

diffcore-rename.c

Lines changed: 65 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -9,63 +9,48 @@
99
#include "hashmap.h"
1010
#include "progress.h"
1111
#include "promisor-remote.h"
12+
#include "strmap.h"
1213

1314
/* Table of rename/copy destinations */
1415

1516
static struct diff_rename_dst {
16-
struct diff_filespec *two;
17-
struct diff_filepair *pair;
17+
struct diff_filepair *p;
18+
struct diff_filespec *filespec_to_free;
19+
int is_rename; /* false -> just a create; true -> rename or copy */
1820
} *rename_dst;
1921
static int rename_dst_nr, rename_dst_alloc;
22+
/* Mapping from break source pathname to break destination index */
23+
static struct strintmap *break_idx = NULL;
2024

21-
static int find_rename_dst(struct diff_filespec *two)
22-
{
23-
int first, last;
24-
25-
first = 0;
26-
last = rename_dst_nr;
27-
while (last > first) {
28-
int next = first + ((last - first) >> 1);
29-
struct diff_rename_dst *dst = &(rename_dst[next]);
30-
int cmp = strcmp(two->path, dst->two->path);
31-
if (!cmp)
32-
return next;
33-
if (cmp < 0) {
34-
last = next;
35-
continue;
36-
}
37-
first = next+1;
38-
}
39-
return -first - 1;
40-
}
41-
42-
static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two)
25+
static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p)
4326
{
44-
int ofs = find_rename_dst(two);
45-
return ofs < 0 ? NULL : &rename_dst[ofs];
27+
/* Lookup by p->ONE->path */
28+
int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1;
29+
return (idx == -1) ? NULL : &rename_dst[idx];
4630
}
4731

4832
/*
4933
* Returns 0 on success, -1 if we found a duplicate.
5034
*/
51-
static int add_rename_dst(struct diff_filespec *two)
35+
static int add_rename_dst(struct diff_filepair *p)
5236
{
53-
int first = find_rename_dst(two);
54-
55-
if (first >= 0)
37+
/*
38+
* If we have multiple entries at the same path in the destination
39+
* tree (an invalid tree, to be sure), turn off rename detection
40+
* entirely. Once upon a time, diffcore-rename had double free()
41+
* issues due to such duplicate paths, resulting in segfaults. See
42+
* commit 4d6be03b95 ("diffcore-rename: avoid processing duplicate
43+
* destinations", 2015-02-26).
44+
*/
45+
if (rename_dst_nr > 0 &&
46+
!strcmp(rename_dst[rename_dst_nr-1].p->two->path, p->two->path))
5647
return -1;
57-
first = -first - 1;
5848

59-
/* insert to make it at "first" */
6049
ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc);
50+
rename_dst[rename_dst_nr].p = p;
51+
rename_dst[rename_dst_nr].filespec_to_free = NULL;
52+
rename_dst[rename_dst_nr].is_rename = 0;
6153
rename_dst_nr++;
62-
if (first < rename_dst_nr)
63-
MOVE_ARRAY(rename_dst + first + 1, rename_dst + first,
64-
rename_dst_nr - first - 1);
65-
rename_dst[first].two = alloc_filespec(two->path);
66-
fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid,
67-
two->mode);
68-
rename_dst[first].pair = NULL;
6954
return 0;
7055
}
7156

@@ -89,6 +74,14 @@ static void register_rename_src(struct diff_filepair *p)
8974
!strcmp(rename_src[rename_src_nr-1].p->one->path, p->one->path))
9075
return;
9176

77+
if (p->broken_pair) {
78+
if (!break_idx) {
79+
break_idx = xmalloc(sizeof(*break_idx));
80+
strintmap_init(break_idx, -1);
81+
}
82+
strintmap_set(break_idx, p->one->path, rename_dst_nr);
83+
}
84+
9285
ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc);
9386
rename_src[rename_src_nr].p = p;
9487
rename_src[rename_src_nr].score = p->score;
@@ -128,14 +121,14 @@ static void prefetch(void *prefetch_options)
128121
struct oid_array to_fetch = OID_ARRAY_INIT;
129122

130123
for (i = 0; i < rename_dst_nr; i++) {
131-
if (rename_dst[i].pair)
124+
if (rename_dst[i].p->renamed_pair)
132125
/*
133126
* The loop in diffcore_rename() will not need these
134127
* blobs, so skip prefetching.
135128
*/
136129
continue; /* already found exact match */
137130
diff_add_if_missing(options->repo, &to_fetch,
138-
rename_dst[i].two);
131+
rename_dst[i].p->two);
139132
}
140133
for (i = 0; i < rename_src_nr; i++) {
141134
if (options->skip_unmodified &&
@@ -245,26 +238,24 @@ static int estimate_similarity(struct repository *r,
245238

246239
static void record_rename_pair(int dst_index, int src_index, int score)
247240
{
248-
struct diff_filespec *src, *dst;
249-
struct diff_filepair *dp;
241+
struct diff_filepair *src = rename_src[src_index].p;
242+
struct diff_filepair *dst = rename_dst[dst_index].p;
250243

251-
if (rename_dst[dst_index].pair)
244+
if (dst->renamed_pair)
252245
die("internal error: dst already matched.");
253246

254-
src = rename_src[src_index].p->one;
255-
src->rename_used++;
256-
src->count++;
247+
src->one->rename_used++;
248+
src->one->count++;
257249

258-
dst = rename_dst[dst_index].two;
259-
dst->count++;
250+
rename_dst[dst_index].filespec_to_free = dst->one;
251+
rename_dst[dst_index].is_rename = 1;
260252

261-
dp = diff_queue(NULL, src, dst);
262-
dp->renamed_pair = 1;
263-
if (!strcmp(src->path, dst->path))
264-
dp->score = rename_src[src_index].score;
253+
dst->one = src->one;
254+
dst->renamed_pair = 1;
255+
if (!strcmp(dst->one->path, dst->two->path))
256+
dst->score = rename_src[src_index].score;
265257
else
266-
dp->score = score;
267-
rename_dst[dst_index].pair = dp;
258+
dst->score = score;
268259
}
269260

270261
/*
@@ -310,7 +301,7 @@ static int find_identical_files(struct hashmap *srcs,
310301
struct diff_options *options)
311302
{
312303
int renames = 0;
313-
struct diff_filespec *target = rename_dst[dst_index].two;
304+
struct diff_filespec *target = rename_dst[dst_index].p->two;
314305
struct file_similarity *p, *best = NULL;
315306
int i = 100, best_score = -1;
316307
unsigned int hash = hash_filespec(options->repo, target);
@@ -476,7 +467,7 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
476467
(mx[i].score < minimum_score))
477468
break; /* there is no more usable pair. */
478469
dst = &rename_dst[mx[i].dst];
479-
if (dst->pair)
470+
if (dst->is_rename)
480471
continue; /* already done, either exact or fuzzy. */
481472
if (!copies && rename_src[mx[i].src].p->one->rename_used)
482473
continue;
@@ -511,7 +502,7 @@ void diffcore_rename(struct diff_options *options)
511502
else if (!options->flags.rename_empty &&
512503
is_empty_blob_oid(&p->two->oid))
513504
continue;
514-
else if (add_rename_dst(p->two) < 0) {
505+
else if (add_rename_dst(p) < 0) {
515506
warning("skipping rename detection, detected"
516507
" duplicate destination '%s'",
517508
p->two->path);
@@ -586,10 +577,10 @@ void diffcore_rename(struct diff_options *options)
586577
mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations),
587578
sizeof(*mx));
588579
for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
589-
struct diff_filespec *two = rename_dst[i].two;
580+
struct diff_filespec *two = rename_dst[i].p->two;
590581
struct diff_score *m;
591582

592-
if (rename_dst[i].pair)
583+
if (rename_dst[i].is_rename)
593584
continue; /* dealt with exact match already. */
594585

595586
m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
@@ -646,22 +637,8 @@ void diffcore_rename(struct diff_options *options)
646637
diff_q(&outq, p);
647638
}
648639
else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
649-
/*
650-
* Creation
651-
*
652-
* We would output this create record if it has
653-
* not been turned into a rename/copy already.
654-
*/
655-
struct diff_rename_dst *dst = locate_rename_dst(p->two);
656-
if (dst && dst->pair) {
657-
diff_q(&outq, dst->pair);
658-
pair_to_free = p;
659-
}
660-
else
661-
/* no matching rename/copy source, so
662-
* record this as a creation.
663-
*/
664-
diff_q(&outq, p);
640+
/* Creation */
641+
diff_q(&outq, p);
665642
}
666643
else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
667644
/*
@@ -682,8 +659,10 @@ void diffcore_rename(struct diff_options *options)
682659
*/
683660
if (DIFF_PAIR_BROKEN(p)) {
684661
/* broken delete */
685-
struct diff_rename_dst *dst = locate_rename_dst(p->one);
686-
if (dst && dst->pair)
662+
struct diff_rename_dst *dst = locate_rename_dst(p);
663+
if (!dst)
664+
BUG("tracking failed somehow; failed to find associated dst for broken pair");
665+
if (dst->is_rename)
687666
/* counterpart is now rename/copy */
688667
pair_to_free = p;
689668
}
@@ -693,16 +672,14 @@ void diffcore_rename(struct diff_options *options)
693672
pair_to_free = p;
694673
}
695674

696-
if (pair_to_free)
697-
;
698-
else
675+
if (!pair_to_free)
699676
diff_q(&outq, p);
700677
}
701678
else if (!diff_unmodified_pair(p))
702679
/* all the usual ones need to be kept */
703680
diff_q(&outq, p);
704681
else
705-
/* no need to keep unmodified pairs */
682+
/* no need to keep unmodified pairs; FIXME: remove earlier? */
706683
pair_to_free = p;
707684

708685
if (pair_to_free)
@@ -715,11 +692,16 @@ void diffcore_rename(struct diff_options *options)
715692
diff_debug_queue("done collapsing", q);
716693

717694
for (i = 0; i < rename_dst_nr; i++)
718-
free_filespec(rename_dst[i].two);
695+
if (rename_dst[i].filespec_to_free)
696+
free_filespec(rename_dst[i].filespec_to_free);
719697

720698
FREE_AND_NULL(rename_dst);
721699
rename_dst_nr = rename_dst_alloc = 0;
722700
FREE_AND_NULL(rename_src);
723701
rename_src_nr = rename_src_alloc = 0;
702+
if (break_idx) {
703+
strintmap_clear(break_idx);
704+
FREE_AND_NULL(break_idx);
705+
}
724706
return;
725707
}

0 commit comments

Comments
 (0)