Skip to content

Commit 6aca863

Browse files
committed
Merge branch 'ds/pack-name-hash-tweak' into seen
In a repository with too many (more than --window size) similarly named files, "git repack" would not find a good delta-base candidate and, worse, it may not use a blob from exactly the same path as a delta-base candidate. Optionally replace the name hash so that only blobs at the same path and nothing else are used as delta-base candidates. * ds/pack-name-hash-tweak: p5313: add size comparison test p5314: add a size test for name-hash collisions git-repack: update usage to match docs pack-objects: add --full-name-hash option
2 parents dc38dda + f17429d commit 6aca863

File tree

12 files changed

+186
-9
lines changed

12 files changed

+186
-9
lines changed

Documentation/git-pack-objects.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ SYNOPSIS
1515
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
1616
[--cruft] [--cruft-expiration=<time>]
1717
[--stdout [--filter=<filter-spec>] | <base-name>]
18-
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
18+
[--shallow] [--keep-true-parents] [--[no-]sparse]
19+
[--full-name-hash] < <object-list>
1920

2021

2122
DESCRIPTION

Documentation/git-repack.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
99
SYNOPSIS
1010
--------
1111
[verse]
12-
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
12+
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
13+
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
14+
[--write-midx] [--full-name-hash]
1315

1416
DESCRIPTION
1517
-----------

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
816816
TEST_BUILTINS_OBJS += test-match-trees.o
817817
TEST_BUILTINS_OBJS += test-mergesort.o
818818
TEST_BUILTINS_OBJS += test-mktemp.o
819+
TEST_BUILTINS_OBJS += test-name-hash.o
819820
TEST_BUILTINS_OBJS += test-online-cpus.o
820821
TEST_BUILTINS_OBJS += test-pack-mtimes.o
821822
TEST_BUILTINS_OBJS += test-parse-options.o

builtin/pack-objects.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,14 @@ struct configured_exclusion {
266266
static struct oidmap configured_exclusions;
267267

268268
static struct oidset excluded_by_config;
269+
static int use_full_name_hash;
270+
271+
static inline uint32_t pack_name_hash_fn(const char *name)
272+
{
273+
if (use_full_name_hash)
274+
return pack_full_name_hash(name);
275+
return pack_name_hash(name);
276+
}
269277

270278
/*
271279
* stats
@@ -1699,7 +1707,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
16991707
return 0;
17001708
}
17011709

1702-
create_object_entry(oid, type, pack_name_hash(name),
1710+
create_object_entry(oid, type, pack_name_hash_fn(name),
17031711
exclude, name && no_try_delta(name),
17041712
found_pack, found_offset);
17051713
return 1;
@@ -1913,7 +1921,7 @@ static void add_preferred_base_object(const char *name)
19131921
{
19141922
struct pbase_tree *it;
19151923
size_t cmplen;
1916-
unsigned hash = pack_name_hash(name);
1924+
unsigned hash = pack_name_hash_fn(name);
19171925

19181926
if (!num_preferred_base || check_pbase_path(hash))
19191927
return;
@@ -3423,7 +3431,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
34233431
* here using a now in order to perhaps improve the delta selection
34243432
* process.
34253433
*/
3426-
oe->hash = pack_name_hash(name);
3434+
oe->hash = pack_name_hash_fn(name);
34273435
oe->no_try_delta = name && no_try_delta(name);
34283436

34293437
stdin_packs_hints_nr++;
@@ -3573,7 +3581,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35733581
entry = packlist_find(&to_pack, oid);
35743582
if (entry) {
35753583
if (name) {
3576-
entry->hash = pack_name_hash(name);
3584+
entry->hash = pack_name_hash_fn(name);
35773585
entry->no_try_delta = no_try_delta(name);
35783586
}
35793587
} else {
@@ -3596,7 +3604,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35963604
return;
35973605
}
35983606

3599-
entry = create_object_entry(oid, type, pack_name_hash(name),
3607+
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
36003608
0, name && no_try_delta(name),
36013609
pack, offset);
36023610
}
@@ -4430,6 +4438,8 @@ int cmd_pack_objects(int argc,
44304438
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
44314439
N_("protocol"),
44324440
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
4441+
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
4442+
N_("optimize delta compression across identical path names over time")),
44334443
OPT_END(),
44344444
};
44354445

builtin/repack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ static int run_update_server_info = 1;
3939
static char *packdir, *packtmp_name, *packtmp;
4040

4141
static const char *const git_repack_usage[] = {
42-
N_("git repack [<options>]"),
42+
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
43+
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
44+
"[--write-midx] [--full-name-hash]"),
4345
NULL
4446
};
4547

@@ -58,6 +60,7 @@ struct pack_objects_args {
5860
int no_reuse_object;
5961
int quiet;
6062
int local;
63+
int full_name_hash;
6164
struct list_objects_filter_options filter_options;
6265
};
6366

@@ -306,6 +309,8 @@ static void prepare_pack_objects(struct child_process *cmd,
306309
strvec_pushf(&cmd->args, "--no-reuse-delta");
307310
if (args->no_reuse_object)
308311
strvec_pushf(&cmd->args, "--no-reuse-object");
312+
if (args->full_name_hash)
313+
strvec_pushf(&cmd->args, "--full-name-hash");
309314
if (args->local)
310315
strvec_push(&cmd->args, "--local");
311316
if (args->quiet)
@@ -1203,6 +1208,8 @@ int cmd_repack(int argc,
12031208
N_("pass --no-reuse-delta to git-pack-objects")),
12041209
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
12051210
N_("pass --no-reuse-object to git-pack-objects")),
1211+
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
1212+
N_("pass --full-name-hash to git-pack-objects")),
12061213
OPT_NEGBIT('n', NULL, &run_update_server_info,
12071214
N_("do not run git-update-server-info"), 1),
12081215
OPT__QUIET(&po_args.quiet, N_("be quiet")),

pack-objects.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,26 @@ static inline uint32_t pack_name_hash(const char *name)
207207
return hash;
208208
}
209209

210+
/*
 * Hash every byte of the path `name` into a 32-bit value.
 *
 * Returns 0 when `name` is NULL. For an empty string the result is the
 * seed prime itself, since the loop body never runs.
 */
static inline uint32_t pack_full_name_hash(const char *name)
{
	const uint32_t bigp = 1234572167U;
	uint32_t c, hash = bigp;

	if (!name)
		return 0;

	/*
	 * Just do the dumbest thing possible: for each byte, add that
	 * byte's multiple of a large prime number, then rotate the hash
	 * right by 5 bits. Goal is not cryptographic, but generally
	 * uniformly distributed.
	 *
	 * Each byte is read as unsigned char: plain char may be signed,
	 * and sign-extending bytes >= 0x80 into a uint32_t would make the
	 * hash of non-ASCII paths differ from one platform to another.
	 */
	while ((c = (unsigned char)*name++) != 0) {
		hash += c * bigp;
		hash = (hash >> 5) | (hash << 27);
	}
	return hash;
}
229+
210230
static inline enum object_type oe_type(const struct object_entry *e)
211231
{
212232
return e->type_valid ? e->type_ : OBJ_BAD;

t/helper/test-name-hash.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* test-name-hash.c: Read a list of paths over stdin and report on their
3+
* name-hash and full name-hash.
4+
*/
5+
6+
#include "test-tool.h"
7+
#include "git-compat-util.h"
8+
#include "pack-objects.h"
9+
#include "strbuf.h"
10+
11+
/*
 * Test helper: read lines from stdin and, for each one, print its
 * pack_name_hash() and pack_full_name_hash() values followed by the line
 * itself. The three columns are tab-separated and both hashes are printed
 * as unsigned decimals padded to a width of 10.
 *
 * Both arguments are unused. Always returns 0.
 */
int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
{
	struct strbuf line = STRBUF_INIT;

	while (!strbuf_getline(&line, stdin)) {
		uint32_t name_hash = pack_name_hash(line.buf);
		uint32_t full_hash = pack_full_name_hash(line.buf);

		printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
	}

	/* Release the line buffer so the helper does not leak it on exit. */
	strbuf_release(&line);
	return 0;
}

t/helper/test-tool.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
4444
{ "match-trees", cmd__match_trees },
4545
{ "mergesort", cmd__mergesort },
4646
{ "mktemp", cmd__mktemp },
47+
{ "name-hash", cmd__name_hash },
4748
{ "online-cpus", cmd__online_cpus },
4849
{ "pack-mtimes", cmd__pack_mtimes },
4950
{ "parse-options", cmd__parse_options },

t/helper/test-tool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
3737
int cmd__match_trees(int argc, const char **argv);
3838
int cmd__mergesort(int argc, const char **argv);
3939
int cmd__mktemp(int argc, const char **argv);
40+
int cmd__name_hash(int argc, const char **argv);
4041
int cmd__online_cpus(int argc, const char **argv);
4142
int cmd__pack_mtimes(int argc, const char **argv);
4243
int cmd__parse_options(int argc, const char **argv);

t/perf/p5313-pack-objects.sh

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/bin/sh

# Compares the run time and resulting size of 'git pack-objects' and
# 'git repack' with and without --full-name-hash on a large repository.
# All sizes are reported in bytes.

test_description='Tests pack performance with and without --full-name-hash'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

# in-thin covers only the tip commit; in-big covers the last 1000 commits.
test_expect_success 'create rev input' '
	cat >in-thin <<-EOF &&
	$(git rev-parse HEAD)
	^$(git rev-parse HEAD~1)
	EOF

	cat >in-big <<-EOF
	$(git rev-parse HEAD)
	^$(git rev-parse HEAD~1000)
	EOF
'

test_perf 'thin pack' '
	git pack-objects --thin --stdout --revs --sparse <in-thin >out
'

test_size 'thin pack size' '
	wc -c <out
'

test_perf 'thin pack with --full-name-hash' '
	git pack-objects --thin --stdout --revs --sparse --full-name-hash <in-thin >out
'

test_size 'thin pack size with --full-name-hash' '
	wc -c <out
'

test_perf 'big pack' '
	git pack-objects --stdout --revs --sparse <in-big >out
'

test_size 'big pack size' '
	wc -c <out
'

test_perf 'big pack with --full-name-hash' '
	git pack-objects --stdout --revs --sparse --full-name-hash <in-big >out
'

test_size 'big pack size with --full-name-hash' '
	wc -c <out
'

test_perf 'repack' '
	git repack -adf
'

# Measure the packfile itself in bytes, like the other sizes in this
# script. "du -a | sort -nr | head -n 1" would instead pick the directory
# total (pack plus index and other files) in platform-dependent block units.
test_size 'repack size' '
	gitdir=$(git rev-parse --git-dir) &&
	pack=$(ls "$gitdir"/objects/pack/pack-*.pack) &&
	wc -c <"$pack"
'

test_perf 'repack with --full-name-hash' '
	git repack -adf --full-name-hash
'

test_size 'repack size with --full-name-hash' '
	gitdir=$(git rev-parse --git-dir) &&
	pack=$(ls "$gitdir"/objects/pack/pack-*.pack) &&
	wc -c <"$pack"
'

test_done

t/perf/p5314-name-hash.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/sh

# Reports how many distinct values, and how many collisions, the two
# pack-objects name-hash functions produce for the paths at HEAD of a
# large repository, using the 'test-tool name-hash' helper.

test_description='Tests name-hash collisions for paths at HEAD'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_size 'paths at head' '
	git ls-tree -r --name-only HEAD >path-list &&
	wc -l <path-list
'

# name-hashes columns: 1 = name-hash, 2 = full name-hash, 3 = path.
test_size 'number of distinct name-hashes' '
	test-tool name-hash <path-list >name-hashes &&
	awk "{ print \$1; }" <name-hashes | sort -n | uniq -c >name-hash-count &&
	wc -l <name-hash-count
'

test_size 'number of distinct full-name-hashes' '
	awk "{ print \$2; }" <name-hashes | sort -n | uniq -c >full-name-hash-count &&
	wc -l <full-name-hash-count
'

# The *-count files hold "uniq -c" output, so the largest leading count
# is the number of paths sharing the most common hash value.
test_size 'maximum multiplicity of name-hashes' '
	sort -nr <name-hash-count | \
		head -n 1 | \
		awk "{ print \$1; }"
'

test_size 'maximum multiplicity of fullname-hashes' '
	sort -nr <full-name-hash-count | \
		head -n 1 | \
		awk "{ print \$1; }"
'

test_done

t/t0450/txt-help-mismatches

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ rebase
4545
remote
4646
remote-ext
4747
remote-fd
48-
repack
4948
reset
5049
restore
5150
rev-parse

0 commit comments

Comments
 (0)