Skip to content

Commit b3dfeeb

Browse files
Kevin Willfordgitster
authored andcommitted
rebase: avoid computing unnecessary patch IDs
The `rebase` family of Git commands avoid applying patches that were already integrated upstream. They do that by using the revision walking option that computes the patch IDs of the two sides of the rebase (local-only patches vs upstream-only ones) and skipping those local patches whose patch ID matches one of the upstream ones. In many cases, this causes unnecessary churn, as already the set of paths touched by a given commit would suffice to determine that an upstream patch has no local equivalent. This hurts performance in particular when there are a lot of upstream patches, and/or large ones. Therefore, let's introduce the concept of a "diff-header-only" patch ID, compare those first, and only evaluate the "full" patch ID lazily. Please note that in contrast to the "full" patch IDs, those "diff-header-only" patch IDs are prone to collide with one another, as adjacent commits frequently touch the very same files. Hence we now have to be careful to allow multiple hash entries with the same hash. We accomplish that by using the hashmap_add() function that does not even test for hash collisions. This also allows us to evaluate the full patch ID lazily, i.e. only when we found commits with matching diff-header-only patch IDs. We add a performance test that demonstrates ~1-6% improvement. In practice this will depend on various factors such as how many upstream changes and how big those changes are along with whether file system caches are cold or warm. As Git's test suite has no way of catching performance regressions, we also add a regression test that verifies that the full patch ID computation is skipped when the diff-header-only computation suffices. Signed-off-by: Kevin Willford <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 3e8e32c commit b3dfeeb

File tree

5 files changed

+85
-8
lines changed

5 files changed

+85
-8
lines changed

builtin/log.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1315,7 +1315,7 @@ static void prepare_bases(struct base_tree_info *bases,
13151315
struct object_id *patch_id;
13161316
if (commit->util)
13171317
continue;
1318-
if (commit_patch_id(commit, &diffopt, sha1))
1318+
if (commit_patch_id(commit, &diffopt, sha1, 0))
13191319
die(_("cannot get patch id"));
13201320
ALLOC_GROW(bases->patch_id, bases->nr_patch_id + 1, bases->alloc_patch_id);
13211321
patch_id = bases->patch_id + bases->nr_patch_id;

patch-ids.c

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,39 @@
55
#include "patch-ids.h"
66

77
int commit_patch_id(struct commit *commit, struct diff_options *options,
8-
unsigned char *sha1)
8+
unsigned char *sha1, int diff_header_only)
99
{
1010
if (commit->parents)
1111
diff_tree_sha1(commit->parents->item->object.oid.hash,
1212
commit->object.oid.hash, "", options);
1313
else
1414
diff_root_tree_sha1(commit->object.oid.hash, "", options);
1515
diffcore_std(options);
16-
return diff_flush_patch_id(options, sha1, 0);
16+
return diff_flush_patch_id(options, sha1, diff_header_only);
1717
}
1818

19+
/*
20+
* When we cannot load the full patch-id for both commits for whatever
21+
* reason, the function returns -1 (i.e. return error(...)). Despite
22+
* the "cmp" in the name of this function, the caller only cares about
23+
* the return value being zero (a and b are equivalent) or non-zero (a
24+
* and b are different), and returning non-zero would keep both in the
25+
* result, even if they actually were equivalent, in order to err on
26+
* the side of safety. The actual value being negative does not have
27+
* any significance; only that it is non-zero matters.
28+
*/
1929
static int patch_id_cmp(struct patch_id *a,
2030
struct patch_id *b,
21-
void *keydata)
31+
struct diff_options *opt)
2232
{
33+
if (is_null_sha1(a->patch_id) &&
34+
commit_patch_id(a->commit, opt, a->patch_id, 0))
35+
return error("Could not get patch ID for %s",
36+
oid_to_hex(&a->commit->object.oid));
37+
if (is_null_sha1(b->patch_id) &&
38+
commit_patch_id(b->commit, opt, b->patch_id, 0))
39+
return error("Could not get patch ID for %s",
40+
oid_to_hex(&b->commit->object.oid));
2341
return hashcmp(a->patch_id, b->patch_id);
2442
}
2543

@@ -43,11 +61,13 @@ static int init_patch_id_entry(struct patch_id *patch,
4361
struct commit *commit,
4462
struct patch_ids *ids)
4563
{
64+
unsigned char header_only_patch_id[GIT_SHA1_RAWSZ];
65+
4666
patch->commit = commit;
47-
if (commit_patch_id(commit, &ids->diffopts, patch->patch_id))
67+
if (commit_patch_id(commit, &ids->diffopts, header_only_patch_id, 1))
4868
return -1;
4969

50-
hashmap_entry_init(patch, sha1hash(patch->patch_id));
70+
hashmap_entry_init(patch, sha1hash(header_only_patch_id));
5171
return 0;
5272
}
5373

@@ -60,7 +80,7 @@ struct patch_id *has_commit_patch_id(struct commit *commit,
6080
if (init_patch_id_entry(&patch, commit, ids))
6181
return NULL;
6282

63-
return hashmap_get(&ids->patches, &patch, NULL);
83+
return hashmap_get(&ids->patches, &patch, &ids->diffopts);
6484
}
6585

6686
struct patch_id *add_commit_patch_id(struct commit *commit,

patch-ids.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ struct patch_ids {
1313
};
1414

1515
int commit_patch_id(struct commit *commit, struct diff_options *options,
16-
unsigned char *sha1);
16+
unsigned char *sha1, int);
1717
int init_patch_ids(struct patch_ids *);
1818
int free_patch_ids(struct patch_ids *);
1919
struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);

t/perf/p3400-rebase.sh

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/bin/sh
2+
3+
test_description='Tests rebase performance'
4+
. ./perf-lib.sh
5+
6+
test_perf_default_repo
7+
8+
test_expect_success 'setup' '
9+
git checkout -f -b base &&
10+
git checkout -b to-rebase &&
11+
git checkout -b upstream &&
12+
for i in $(seq 100)
13+
do
14+
# simulate huge diffs
15+
echo change$i >unrelated-file$i &&
16+
seq 1000 >>unrelated-file$i &&
17+
git add unrelated-file$i &&
18+
test_tick &&
19+
git commit -m commit$i unrelated-file$i &&
20+
echo change$i >unrelated-file$i &&
21+
seq 1000 | tac >>unrelated-file$i &&
22+
git add unrelated-file$i &&
23+
test_tick &&
24+
git commit -m commit$i-reverse unrelated-file$i ||
25+
break
26+
done &&
27+
git checkout to-rebase &&
28+
test_commit our-patch interesting-file
29+
'
30+
31+
test_perf 'rebase on top of a lot of unrelated changes' '
32+
git rebase --onto upstream HEAD^ &&
33+
git rebase --onto base HEAD^
34+
'
35+
36+
test_done

t/t6007-rev-list-cherry-pick-file.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,4 +207,25 @@ test_expect_success '--count --left-right' '
207207
test_cmp expect actual
208208
'
209209

210+
# Corrupt the object store deliberately to make sure
211+
# the object is not even checked for its existence.
212+
remove_loose_object () {
213+
sha1="$(git rev-parse "$1")" &&
214+
remainder=${sha1#??} &&
215+
firsttwo=${sha1%$remainder} &&
216+
rm .git/objects/$firsttwo/$remainder
217+
}
218+
219+
test_expect_success '--cherry-pick avoids looking at full diffs' '
220+
git checkout -b shy-diff &&
221+
test_commit dont-look-at-me &&
222+
echo Hello >dont-look-at-me.t &&
223+
test_tick &&
224+
git commit -m tip dont-look-at-me.t &&
225+
git checkout -b mainline HEAD^ &&
226+
test_commit to-cherry-pick &&
227+
remove_loose_object shy-diff^:dont-look-at-me.t &&
228+
git rev-list --cherry-pick ...shy-diff
229+
'
230+
210231
test_done

0 commit comments

Comments
 (0)