Skip to content

Commit ac77d0c

Browse files
pclouds authored and gitster committed
pack-objects: shrink size field in struct object_entry
It's very rare that an uncompressed object is larger than 4GB (partly because Git does not handle such large files very well to begin with). Let's optimize for the common case where the object size is smaller than this limit.

Shrink the size field down to 31 bits plus one overflow bit. If the size is too large, we read it back from disk. As noted in the previous patch, we need to return the delta size instead of the canonical size when the to-be-reused object entry type is a delta instead of a canonical one.

Add two compare helpers that can take advantage of the overflow bit (e.g. if the file is 4GB+, chances are it's already larger than core.bigFileThreshold and there's no point in comparing the actual value).

Another note about oe_get_size_slow(): this function MUST be thread safe because the SIZE() macro is used inside try_delta(), which may run in parallel. Outside parallel code, no-contention locking should be dirt cheap (or insignificant compared to I/O access anyway).

To exercise this code, it's best to run the test suite with something like

    make test GIT_TEST_OE_SIZE=4

which forces this code on all objects larger than 3 bytes.

Signed-off-by: Nguyễn Thái Ngọc Duy <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 27a7d06 commit ac77d0c

File tree

4 files changed

+145
-26
lines changed

4 files changed

+145
-26
lines changed

builtin/pack-objects.c

Lines changed: 80 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
#include "object-store.h"
3333

3434
#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
35+
#define SIZE(obj) oe_size(&to_pack, obj)
36+
#define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
3537
#define DELTA(obj) oe_delta(&to_pack, obj)
3638
#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
3739
#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
@@ -276,7 +278,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
276278

277279
if (!usable_delta) {
278280
if (oe_type(entry) == OBJ_BLOB &&
279-
entry->size > big_file_threshold &&
281+
oe_size_greater_than(&to_pack, entry, big_file_threshold) &&
280282
(st = open_istream(&entry->idx.oid, &type, &size, NULL)) != NULL)
281283
buf = NULL;
282284
else {
@@ -385,12 +387,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
385387
unsigned char header[MAX_PACK_OBJECT_HEADER],
386388
dheader[MAX_PACK_OBJECT_HEADER];
387389
unsigned hdrlen;
390+
unsigned long entry_size = SIZE(entry);
388391

389392
if (DELTA(entry))
390393
type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
391394
OBJ_OFS_DELTA : OBJ_REF_DELTA;
392395
hdrlen = encode_in_pack_object_header(header, sizeof(header),
393-
type, entry->size);
396+
type, entry_size);
394397

395398
offset = entry->in_pack_offset;
396399
revidx = find_pack_revindex(p, offset);
@@ -407,7 +410,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
407410
datalen -= entry->in_pack_header_size;
408411

409412
if (!pack_to_stdout && p->index_version == 1 &&
410-
check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
413+
check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
411414
error("corrupt packed object for %s",
412415
oid_to_hex(&entry->idx.oid));
413416
unuse_pack(&w_curs);
@@ -1408,6 +1411,8 @@ static void cleanup_preferred_base(void)
14081411

14091412
static void check_object(struct object_entry *entry)
14101413
{
1414+
unsigned long canonical_size;
1415+
14111416
if (IN_PACK(entry)) {
14121417
struct packed_git *p = IN_PACK(entry);
14131418
struct pack_window *w_curs = NULL;
@@ -1445,7 +1450,7 @@ static void check_object(struct object_entry *entry)
14451450
default:
14461451
/* Not a delta hence we've already got all we need. */
14471452
oe_set_type(entry, entry->in_pack_type);
1448-
entry->size = in_pack_size;
1453+
SET_SIZE(entry, in_pack_size);
14491454
entry->in_pack_header_size = used;
14501455
if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
14511456
goto give_up;
@@ -1502,9 +1507,9 @@ static void check_object(struct object_entry *entry)
15021507
* circular deltas.
15031508
*/
15041509
oe_set_type(entry, entry->in_pack_type);
1505-
entry->size = in_pack_size; /* delta size */
1510+
SET_SIZE(entry, in_pack_size); /* delta size */
15061511
SET_DELTA(entry, base_entry);
1507-
entry->delta_size = entry->size;
1512+
entry->delta_size = in_pack_size;
15081513
entry->delta_sibling_idx = base_entry->delta_child_idx;
15091514
SET_DELTA_CHILD(base_entry, entry);
15101515
unuse_pack(&w_curs);
@@ -1520,9 +1525,10 @@ static void check_object(struct object_entry *entry)
15201525
* object size from the delta header.
15211526
*/
15221527
delta_pos = entry->in_pack_offset + entry->in_pack_header_size;
1523-
entry->size = get_size_from_delta(p, &w_curs, delta_pos);
1524-
if (entry->size == 0)
1528+
canonical_size = get_size_from_delta(p, &w_curs, delta_pos);
1529+
if (canonical_size == 0)
15251530
goto give_up;
1531+
SET_SIZE(entry, canonical_size);
15261532
unuse_pack(&w_curs);
15271533
return;
15281534
}
@@ -1536,13 +1542,17 @@ static void check_object(struct object_entry *entry)
15361542
unuse_pack(&w_curs);
15371543
}
15381544

1539-
oe_set_type(entry, oid_object_info(&entry->idx.oid, &entry->size));
1540-
/*
1541-
* The error condition is checked in prepare_pack(). This is
1542-
* to permit a missing preferred base object to be ignored
1543-
* as a preferred base. Doing so can result in a larger
1544-
* pack file, but the transfer will still take place.
1545-
*/
1545+
oe_set_type(entry, oid_object_info(&entry->idx.oid, &canonical_size));
1546+
if (entry->type_valid) {
1547+
SET_SIZE(entry, canonical_size);
1548+
} else {
1549+
/*
1550+
* Bad object type is checked in prepare_pack(). This is
1551+
* to permit a missing preferred base object to be ignored
1552+
* as a preferred base. Doing so can result in a larger
1553+
* pack file, but the transfer will still take place.
1554+
*/
1555+
}
15461556
}
15471557

15481558
static int pack_offset_sort(const void *_a, const void *_b)
@@ -1582,6 +1592,7 @@ static void drop_reused_delta(struct object_entry *entry)
15821592
unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
15831593
struct object_info oi = OBJECT_INFO_INIT;
15841594
enum object_type type;
1595+
unsigned long size;
15851596

15861597
while (*idx) {
15871598
struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1594,7 +1605,7 @@ static void drop_reused_delta(struct object_entry *entry)
15941605
SET_DELTA(entry, NULL);
15951606
entry->depth = 0;
15961607

1597-
oi.sizep = &entry->size;
1608+
oi.sizep = &size;
15981609
oi.typep = &type;
15991610
if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
16001611
/*
@@ -1603,11 +1614,11 @@ static void drop_reused_delta(struct object_entry *entry)
16031614
* And if that fails, the error will be recorded in oe_type(entry)
16041615
* and dealt with in prepare_pack().
16051616
*/
1606-
oe_set_type(entry, oid_object_info(&entry->idx.oid,
1607-
&entry->size));
1617+
oe_set_type(entry, oid_object_info(&entry->idx.oid, &size));
16081618
} else {
16091619
oe_set_type(entry, type);
16101620
}
1621+
SET_SIZE(entry, size);
16111622
}
16121623

16131624
/*
@@ -1747,7 +1758,8 @@ static void get_object_details(void)
17471758
for (i = 0; i < to_pack.nr_objects; i++) {
17481759
struct object_entry *entry = sorted_by_offset[i];
17491760
check_object(entry);
1750-
if (entry->type_valid && big_file_threshold < entry->size)
1761+
if (entry->type_valid &&
1762+
oe_size_greater_than(&to_pack, entry, big_file_threshold))
17511763
entry->no_try_delta = 1;
17521764
}
17531765

@@ -1776,6 +1788,8 @@ static int type_size_sort(const void *_a, const void *_b)
17761788
const struct object_entry *b = *(struct object_entry **)_b;
17771789
enum object_type a_type = oe_type(a);
17781790
enum object_type b_type = oe_type(b);
1791+
unsigned long a_size = SIZE(a);
1792+
unsigned long b_size = SIZE(b);
17791793

17801794
if (a_type > b_type)
17811795
return -1;
@@ -1789,9 +1803,9 @@ static int type_size_sort(const void *_a, const void *_b)
17891803
return -1;
17901804
if (a->preferred_base < b->preferred_base)
17911805
return 1;
1792-
if (a->size > b->size)
1806+
if (a_size > b_size)
17931807
return -1;
1794-
if (a->size < b->size)
1808+
if (a_size < b_size)
17951809
return 1;
17961810
return a < b ? -1 : (a > b); /* newest first */
17971811
}
@@ -1844,6 +1858,46 @@ static pthread_mutex_t progress_mutex;
18441858

18451859
#endif
18461860

1861+
/*
1862+
* Slow path used when no size is cached in the entry (see oe_size()): return the size of the object without doing any delta
1863+
* reconstruction (so non-deltas are true object sizes, but deltas
1864+
* return the size of the delta data). Calls die() if the size cannot be read; each lookup is bracketed by read_lock()/read_unlock().
1865+
*/
1866+
unsigned long oe_get_size_slow(struct packing_data *pack,
1867+
const struct object_entry *e)
1868+
{
1869+
struct packed_git *p;
1870+
struct pack_window *w_curs;
1871+
unsigned char *buf;
1872+
enum object_type type;
1873+
unsigned long used, avail, size;
1874+
1875+
if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) { /* not a delta: look the size up by object id */
1876+
read_lock();
1877+
if (oid_object_info(&e->idx.oid, &size) < 0)
1878+
die(_("unable to get size of %s"),
1879+
oid_to_hex(&e->idx.oid));
1880+
read_unlock();
1881+
return size;
1882+
}
1883+
1884+
p = oe_in_pack(pack, e); /* delta: the size is taken from the entry's header inside its pack */
1885+
if (!p)
1886+
BUG("when e->type is a delta, it must belong to a pack");
1887+
1888+
read_lock();
1889+
w_curs = NULL;
1890+
buf = use_pack(p, &w_curs, e->in_pack_offset, &avail);
1891+
used = unpack_object_header_buffer(buf, avail, &type, &size); /* only 'size' is used below; 'type' is discarded */
1892+
if (used == 0) /* zero bytes consumed is treated as a header parse failure */
1893+
die(_("unable to parse object header of %s"),
1894+
oid_to_hex(&e->idx.oid));
1895+
1896+
unuse_pack(&w_curs);
1897+
read_unlock();
1898+
return size;
1899+
}
1900+
18471901
static int try_delta(struct unpacked *trg, struct unpacked *src,
18481902
unsigned max_depth, unsigned long *mem_usage)
18491903
{
@@ -1878,7 +1932,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
18781932
return 0;
18791933

18801934
/* Now some size filtering heuristics. */
1881-
trg_size = trg_entry->size;
1935+
trg_size = SIZE(trg_entry);
18821936
if (!DELTA(trg_entry)) {
18831937
max_size = trg_size/2 - 20;
18841938
ref_depth = 1;
@@ -1890,7 +1944,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
18901944
(max_depth - ref_depth + 1);
18911945
if (max_size == 0)
18921946
return 0;
1893-
src_size = src_entry->size;
1947+
src_size = SIZE(src_entry);
18941948
sizediff = src_size < trg_size ? trg_size - src_size : 0;
18951949
if (sizediff >= max_size)
18961950
return 0;
@@ -2008,7 +2062,7 @@ static unsigned long free_unpacked(struct unpacked *n)
20082062
free_delta_index(n->index);
20092063
n->index = NULL;
20102064
if (n->data) {
2011-
freed_mem += n->entry->size;
2065+
freed_mem += SIZE(n->entry);
20122066
FREE_AND_NULL(n->data);
20132067
}
20142068
n->entry = NULL;
@@ -2458,7 +2512,8 @@ static void prepare_pack(int window, int depth)
24582512
*/
24592513
continue;
24602514

2461-
if (!entry->type_valid || entry->size < 50)
2515+
if (!entry->type_valid ||
2516+
oe_size_less_than(&to_pack, entry, 50))
24622517
continue;
24632518

24642519
if (entry->no_try_delta)

pack-objects.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ void prepare_packing_data(struct packing_data *pdata)
143143
} else {
144144
prepare_in_pack_by_idx(pdata);
145145
}
146+
147+
pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
148+
1U << OE_SIZE_BITS);
146149
}
147150

148151
struct object_entry *packlist_alloc(struct packing_data *pdata,

pack-objects.h

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
#define OE_DEPTH_BITS 12
88
#define OE_IN_PACK_BITS 10
99
#define OE_Z_DELTA_BITS 20
10+
/*
11+
* Note that oe_set_size() becomes expensive when the given size does
12+
* not fit in this many bits. Don't lower it too much.
13+
*/
14+
#define OE_SIZE_BITS 31
1015

1116
/*
1217
* State flags for depth-first search used for analyzing delta cycles.
@@ -70,7 +75,8 @@ enum dfs_state {
7075
*/
7176
struct object_entry {
7277
struct pack_idx_entry idx;
73-
unsigned long size; /* uncompressed size */
78+
unsigned size_:OE_SIZE_BITS;
79+
unsigned size_valid:1;
7480
unsigned in_pack_idx:OE_IN_PACK_BITS; /* already in pack */
7581
off_t in_pack_offset;
7682
uint32_t delta_idx; /* delta base object */
@@ -115,6 +121,8 @@ struct packing_data {
115121
*/
116122
struct packed_git **in_pack_by_idx;
117123
struct packed_git **in_pack;
124+
125+
uintmax_t oe_size_limit;
118126
};
119127

120128
void prepare_packing_data(struct packing_data *pdata);
@@ -254,4 +262,51 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
254262
e->delta_sibling_idx = 0;
255263
}
256264

265+
unsigned long oe_get_size_slow(struct packing_data *pack,
266+
const struct object_entry *e);
267+
static inline unsigned long oe_size(struct packing_data *pack,
268+
const struct object_entry *e)
269+
{ /* Return the size of 'e': the cached bit-field when valid, otherwise the value read back by oe_get_size_slow(). */
270+
if (e->size_valid)
271+
return e->size_;
272+
273+
return oe_get_size_slow(pack, e); /* no cached size in the entry */
274+
}
275+
276+
static inline int oe_size_less_than(struct packing_data *pack,
277+
const struct object_entry *lhs,
278+
unsigned long rhs)
279+
{ /* Return non-zero if the size of 'lhs' is strictly below 'rhs', avoiding the slow lookup when the answer is already implied. */
280+
if (lhs->size_valid)
281+
return lhs->size_ < rhs;
282+
if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? (assumes size_valid was cleared by oe_set_size(), i.e. size >= oe_size_limit) */
283+
return 0;
284+
return oe_get_size_slow(pack, lhs) < rhs; /* rhs is at or above the limit: the real size is needed */
285+
}
286+
287+
static inline int oe_size_greater_than(struct packing_data *pack,
288+
const struct object_entry *lhs,
289+
unsigned long rhs)
290+
{ /* Return non-zero if the size of 'lhs' is strictly above 'rhs', avoiding the slow lookup when the answer is already implied. */
291+
if (lhs->size_valid)
292+
return lhs->size_ > rhs;
293+
if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? (assumes size_valid was cleared by oe_set_size(), i.e. size >= oe_size_limit) */
294+
return 1;
295+
return oe_get_size_slow(pack, lhs) > rhs; /* rhs is at or above the limit: the real size is needed */
296+
}
297+
298+
static inline void oe_set_size(struct packing_data *pack,
299+
struct object_entry *e,
300+
unsigned long size)
301+
{ /* Record 'size' for 'e': cache it in the entry when it is below pack->oe_size_limit, otherwise mark the entry as uncached. */
302+
if (size < pack->oe_size_limit) {
303+
e->size_ = size;
304+
e->size_valid = 1;
305+
} else {
306+
e->size_valid = 0; /* later reads go through oe_get_size_slow() */
307+
if (oe_get_size_slow(pack, e) != size) /* sanity check: the slow lookup must reproduce the value the caller passed */
308+
BUG("'size' is supposed to be the object size!");
309+
}
310+
}
311+
257312
#endif

t/README

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,12 @@ pack-objects code path where there are more than 1024 packs even if
309309
the actual number of packs in repository is below this limit. Accept
310310
any boolean values that are accepted by git-config.
311311

312+
GIT_TEST_OE_SIZE=<n> exercises the uncommon pack-objects code path
313+
where we do not cache object size in memory and read it from existing
314+
packs on demand. This normally only happens when the object size is
315+
over 2GB. This variable forces the code path on any object of at least
316+
<n> bytes.
317+
312318
Naming Tests
313319
------------
314320

0 commit comments

Comments
 (0)