Skip to content

Commit e6c587c

Browse files
torvalds authored and gitster committed
abbrev: auto size the default abbreviation
In fairly early days we somehow decided to abbreviate object names down to 7 hexdigits, but as projects grow, it is becoming more and more likely that such short object names, made in earlier days and recorded in log messages, are no longer unique. Currently the Linux kernel project needs 11 to 12 hexdigits and Git itself needs 10 hexdigits to uniquely identify the objects they have, while many smaller projects may still be fine with the original 7-hexdigit default. One size does not fit all projects. Introduce a mechanism where we estimate the number of objects in the repository upon the first request to abbreviate an object name with the default setting, and come up with a sane default for the repository. Based on the expectation that we would see a collision in a repository with 2^(2N) objects when using object names shortened to the first N hexdigits (4N bits), use a sufficient number of hexdigits to cover the number of objects in the repository. Each hexdigit (4 bits) we add to the shortened name allows us to have four times (2 bits) as many objects in the repository. Signed-off-by: Linus Torvalds <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 7b5b772 commit e6c587c

File tree

3 files changed

+29
-2
lines changed

3 files changed

+29
-2
lines changed

cache.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,7 @@ struct object_context {
12041204
#define GET_SHA1_TREEISH 020
12051205
#define GET_SHA1_BLOB 040
12061206
#define GET_SHA1_FOLLOW_SYMLINKS 0100
1207+
#define GET_SHA1_AUTOMATIC 0200
12071208
#define GET_SHA1_ONLY_TO_DIE 04000
12081209

12091210
#define GET_SHA1_DISAMBIGUATORS \

environment.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ int trust_executable_bit = 1;
1616
int trust_ctime = 1;
1717
int check_stat = 1;
1818
int has_symlinks = 1;
19-
int minimum_abbrev = 4, default_abbrev = FALLBACK_DEFAULT_ABBREV;
19+
int minimum_abbrev = 4, default_abbrev = -1;
2020
int ignore_case;
2121
int assume_unchanged;
2222
int prefer_symlink_refs;

sha1_name.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ typedef int (*disambiguate_hint_fn)(const unsigned char *, void *);
1515

1616
struct disambiguate_state {
1717
int len; /* length of prefix in hex chars */
18+
unsigned int nrobjects;
1819
char hex_pfx[GIT_SHA1_HEXSZ + 1];
1920
unsigned char bin_pfx[GIT_SHA1_RAWSZ];
2021

@@ -118,6 +119,14 @@ static void find_short_object_filename(struct disambiguate_state *ds)
118119

119120
if (strlen(de->d_name) != 38)
120121
continue;
122+
123+
/*
124+
* We only look at the one subdirectory, and we assume
125+
* each subdirectory is roughly similar, so each
126+
* object we find probably has 255 other objects in
127+
* the other fan-out directories.
128+
*/
129+
ds->nrobjects += 256;
121130
if (memcmp(de->d_name, ds->hex_pfx + 2, ds->len - 2))
122131
continue;
123132
memcpy(hex + 2, de->d_name, 38);
@@ -151,6 +160,7 @@ static void unique_in_pack(struct packed_git *p,
151160

152161
open_pack_index(p);
153162
num = p->num_objects;
163+
ds->nrobjects += num;
154164
last = num;
155165
while (first < last) {
156166
uint32_t mid = (first + last) / 2;
@@ -380,6 +390,9 @@ static int show_ambiguous_object(const unsigned char *sha1, void *data)
380390
return 0;
381391
}
382392

393+
/* start from our historical default before the automatic abbreviation */
394+
static int default_automatic_abbrev = FALLBACK_DEFAULT_ABBREV;
395+
383396
static int get_short_sha1(const char *name, int len, unsigned char *sha1,
384397
unsigned flags)
385398
{
@@ -426,6 +439,14 @@ static int get_short_sha1(const char *name, int len, unsigned char *sha1,
426439
for_each_abbrev(ds.hex_pfx, show_ambiguous_object, &ds);
427440
}
428441

442+
if (len < 16 && !status && (flags & GET_SHA1_AUTOMATIC)) {
443+
unsigned int expect_collision = 1 << (len * 2);
444+
if (ds.nrobjects > expect_collision) {
445+
default_automatic_abbrev = len+1;
446+
return SHORT_NAME_AMBIGUOUS;
447+
}
448+
}
449+
429450
return status;
430451
}
431452

@@ -458,14 +479,19 @@ int for_each_abbrev(const char *prefix, each_abbrev_fn fn, void *cb_data)
458479
int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len)
459480
{
460481
int status, exists;
482+
int flags = GET_SHA1_QUIETLY;
461483

484+
if (len < 0) {
485+
flags |= GET_SHA1_AUTOMATIC;
486+
len = default_automatic_abbrev;
487+
}
462488
sha1_to_hex_r(hex, sha1);
463489
if (len == 40 || !len)
464490
return 40;
465491
exists = has_sha1_file(sha1);
466492
while (len < 40) {
467493
unsigned char sha1_ret[20];
468-
status = get_short_sha1(hex, len, sha1_ret, GET_SHA1_QUIETLY);
494+
status = get_short_sha1(hex, len, sha1_ret, flags);
469495
if (exists
470496
? !status
471497
: status == SHORT_NAME_NOT_FOUND) {

0 commit comments

Comments
 (0)