Skip to content

Commit e6e24c9

Browse files
committed
Merge branch 'jk/pack-objects-optim-mru'
"git pack-objects" in a repository with many packfiles used to spend a lot of time looking for/at objects in them; the accesses to the packfiles are now optimized by checking the most-recently-used packfile first. * jk/pack-objects-optim-mru: pack-objects: use mru list when iterating over packs pack-objects: break delta cycles before delta-search phase sha1_file: make packed_object_info public provide an initializer for "struct object_info"
2 parents b8688ad + c9af708 commit e6e24c9

File tree

7 files changed

+227
-12
lines changed

7 files changed

+227
-12
lines changed

builtin/cat-file.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name,
5353
char *buf;
5454
unsigned long size;
5555
struct object_context obj_context;
56-
struct object_info oi = {NULL};
56+
struct object_info oi = OBJECT_INFO_INIT;
5757
struct strbuf sb = STRBUF_INIT;
5858
unsigned flags = LOOKUP_REPLACE_OBJECT;
5959
const char *path = force_path;
@@ -449,8 +449,7 @@ static int batch_objects(struct batch_options *opt)
449449
data.split_on_whitespace = 1;
450450

451451
if (opt->all_objects) {
452-
struct object_info empty;
453-
memset(&empty, 0, sizeof(empty));
452+
struct object_info empty = OBJECT_INFO_INIT;
454453
if (!memcmp(&data.info, &empty, sizeof(empty)))
455454
data.skip_object_info = 1;
456455
}

builtin/pack-objects.c

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "reachable.h"
2424
#include "sha1-array.h"
2525
#include "argv-array.h"
26+
#include "mru.h"
2627

2728
static const char *pack_usage[] = {
2829
N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -994,7 +995,7 @@ static int want_object_in_pack(const unsigned char *sha1,
994995
struct packed_git **found_pack,
995996
off_t *found_offset)
996997
{
997-
struct packed_git *p;
998+
struct mru_entry *entry;
998999
int want;
9991000

10001001
if (!exclude && local && has_loose_object_nonlocal(sha1))
@@ -1011,7 +1012,8 @@ static int want_object_in_pack(const unsigned char *sha1,
10111012
return want;
10121013
}
10131014

1014-
for (p = packed_git; p; p = p->next) {
1015+
for (entry = packed_git_mru->head; entry; entry = entry->next) {
1016+
struct packed_git *p = entry->item;
10151017
off_t offset;
10161018

10171019
if (p == *found_pack)
@@ -1027,6 +1029,8 @@ static int want_object_in_pack(const unsigned char *sha1,
10271029
*found_pack = p;
10281030
}
10291031
want = want_found_object(exclude, p);
1032+
if (!exclude && want > 0)
1033+
mru_mark(packed_git_mru, entry);
10301034
if (want != -1)
10311035
return want;
10321036
}
@@ -1527,6 +1531,83 @@ static int pack_offset_sort(const void *_a, const void *_b)
15271531
(a->in_pack_offset > b->in_pack_offset);
15281532
}
15291533

1534+
/*
1535+
* Drop an on-disk delta we were planning to reuse. Naively, this would
1536+
* just involve blanking out the "delta" field, but we have to deal
1537+
* with some extra book-keeping:
1538+
*
1539+
* 1. Removing ourselves from the delta_sibling linked list.
1540+
*
1541+
* 2. Updating our size/type to the non-delta representation. These were
1542+
* either not recorded initially (size) or overwritten with the delta type
1543+
* (type) when check_object() decided to reuse the delta.
1544+
*/
1545+
static void drop_reused_delta(struct object_entry *entry)
1546+
{
1547+
struct object_entry **p = &entry->delta->delta_child;
1548+
struct object_info oi = OBJECT_INFO_INIT;
1549+
1550+
while (*p) {
1551+
if (*p == entry)
1552+
*p = (*p)->delta_sibling;
1553+
else
1554+
p = &(*p)->delta_sibling;
1555+
}
1556+
entry->delta = NULL;
1557+
1558+
oi.sizep = &entry->size;
1559+
oi.typep = &entry->type;
1560+
if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
1561+
/*
1562+
* We failed to get the info from this pack for some reason;
1563+
* fall back to sha1_object_info, which may find another copy.
1564+
* And if that fails, the error will be recorded in entry->type
1565+
* and dealt with in prepare_pack().
1566+
*/
1567+
entry->type = sha1_object_info(entry->idx.sha1, &entry->size);
1568+
}
1569+
}
1570+
1571+
/*
1572+
* Follow the chain of deltas from this entry onward, throwing away any links
1573+
* that cause us to hit a cycle (as determined by the DFS state flags in
1574+
* the entries).
1575+
*/
1576+
static void break_delta_chains(struct object_entry *entry)
1577+
{
1578+
/* If it's not a delta, it can't be part of a cycle. */
1579+
if (!entry->delta) {
1580+
entry->dfs_state = DFS_DONE;
1581+
return;
1582+
}
1583+
1584+
switch (entry->dfs_state) {
1585+
case DFS_NONE:
1586+
/*
1587+
* This is the first time we've seen the object. We mark it as
1588+
* part of the active potential cycle and recurse.
1589+
*/
1590+
entry->dfs_state = DFS_ACTIVE;
1591+
break_delta_chains(entry->delta);
1592+
entry->dfs_state = DFS_DONE;
1593+
break;
1594+
1595+
case DFS_DONE:
1596+
/* object already examined, and not part of a cycle */
1597+
break;
1598+
1599+
case DFS_ACTIVE:
1600+
/*
1601+
* We found a cycle that needs broken. It would be correct to
1602+
* break any link in the chain, but it's convenient to
1603+
* break this one.
1604+
*/
1605+
drop_reused_delta(entry);
1606+
entry->dfs_state = DFS_DONE;
1607+
break;
1608+
}
1609+
}
1610+
15301611
static void get_object_details(void)
15311612
{
15321613
uint32_t i;
@@ -1544,6 +1625,13 @@ static void get_object_details(void)
15441625
entry->no_try_delta = 1;
15451626
}
15461627

1628+
/*
1629+
* This must happen in a second pass, since we rely on the delta
1630+
* information for the whole list being completed.
1631+
*/
1632+
for (i = 0; i < to_pack.nr_objects; i++)
1633+
break_delta_chains(&to_pack.objects[i]);
1634+
15471635
free(sorted_by_offset);
15481636
}
15491637

cache.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1602,7 +1602,15 @@ struct object_info {
16021602
} packed;
16031603
} u;
16041604
};
1605+
1606+
/*
1607+
* Initializer for a "struct object_info" that wants no items. You may
1608+
* also memset() the memory to all-zeroes.
1609+
*/
1610+
#define OBJECT_INFO_INIT {NULL}
1611+
16051612
extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags);
1613+
extern int packed_object_info(struct packed_git *pack, off_t offset, struct object_info *);
16061614

16071615
/* Dumb servers support */
16081616
extern int update_server_info(int);

pack-objects.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,15 @@ struct object_entry {
2727
unsigned no_try_delta:1;
2828
unsigned tagged:1; /* near the very tip of refs */
2929
unsigned filled:1; /* assigned write-order */
30+
31+
/*
32+
* State flags for depth-first search used for analyzing delta cycles.
33+
*/
34+
enum {
35+
DFS_NONE = 0,
36+
DFS_ACTIVE,
37+
DFS_DONE
38+
} dfs_state;
3039
};
3140

3241
struct packing_data {

sha1_file.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1826,11 +1826,9 @@ static int parse_sha1_header_extended(const char *hdr, struct object_info *oi,
18261826

18271827
int parse_sha1_header(const char *hdr, unsigned long *sizep)
18281828
{
1829-
struct object_info oi;
1829+
struct object_info oi = OBJECT_INFO_INIT;
18301830

18311831
oi.sizep = sizep;
1832-
oi.typename = NULL;
1833-
oi.typep = NULL;
18341832
return parse_sha1_header_extended(hdr, &oi, LOOKUP_REPLACE_OBJECT);
18351833
}
18361834

@@ -2068,8 +2066,8 @@ static enum object_type packed_to_object_type(struct packed_git *p,
20682066
goto out;
20692067
}
20702068

2071-
static int packed_object_info(struct packed_git *p, off_t obj_offset,
2072-
struct object_info *oi)
2069+
int packed_object_info(struct packed_git *p, off_t obj_offset,
2070+
struct object_info *oi)
20732071
{
20742072
struct pack_window *w_curs = NULL;
20752073
unsigned long size;
@@ -2840,7 +2838,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
28402838
int sha1_object_info(const unsigned char *sha1, unsigned long *sizep)
28412839
{
28422840
enum object_type type;
2843-
struct object_info oi = {NULL};
2841+
struct object_info oi = OBJECT_INFO_INIT;
28442842

28452843
oi.typep = &type;
28462844
oi.sizep = sizep;

streaming.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1,
135135
struct stream_filter *filter)
136136
{
137137
struct git_istream *st;
138-
struct object_info oi = {NULL};
138+
struct object_info oi = OBJECT_INFO_INIT;
139139
const unsigned char *real = lookup_replace_object(sha1);
140140
enum input_source src = istream_source(real, type, &oi);
141141

t/t5314-pack-cycle-detection.sh

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
#!/bin/sh
2+
3+
test_description='test handling of inter-pack delta cycles during repack
4+
5+
The goal here is to create a situation where we have two blobs, A and B, with A
6+
as a delta against B in one pack, and vice versa in the other. Then if we can
7+
persuade a full repack to find A from one pack and B from the other, that will
8+
give us a cycle when we attempt to reuse those deltas.
9+
10+
The trick is in the "persuade" step, as it depends on the internals of how
11+
pack-objects picks which pack to reuse the deltas from. But we can assume
12+
that it does so in one of two general strategies:
13+
14+
1. Using a static ordering of packs. In this case, no inter-pack cycles can
15+
happen. Any objects with a delta relationship must be present in the same
16+
pack (i.e., no "--thin" packs on disk), so we will find all related objects
17+
from that pack. So assuming there are no cycles within a single pack (and
18+
we avoid generating them via pack-objects or importing them via
19+
index-pack), then our result will have no cycles.
20+
21+
So this case should pass the tests no matter how we arrange things.
22+
23+
2. Picking the next pack to examine based on locality (i.e., where we found
24+
something else recently).
25+
26+
In this case, we want to make sure that we find the delta versions of A and
27+
B and not their base versions. We can do this by putting two blobs in each
28+
pack. The first is a "dummy" blob that can only be found in the pack in
29+
question. And then the second is the actual delta we want to find.
30+
31+
The two blobs must be present in the same tree, not present in other trees,
32+
and the dummy pathname must sort before the delta path.
33+
34+
The setup below focuses on case 2. We have two commits HEAD and HEAD^, each
35+
which has two files: "dummy" and "file". Then we can make two packs which
36+
contain:
37+
38+
[pack one]
39+
HEAD:dummy
40+
HEAD:file (as delta against HEAD^:file)
41+
HEAD^:file (as base)
42+
43+
[pack two]
44+
HEAD^:dummy
45+
HEAD^:file (as delta against HEAD:file)
46+
HEAD:file (as base)
47+
48+
Then no matter which order we start looking at the packs in, we know that we
49+
will always find a delta for "file", because its lookup will always come
50+
immediately after the lookup for "dummy".
51+
'
52+
. ./test-lib.sh
53+
54+
55+
56+
# Create a pack containing the the tree $1 and blob $1:file, with
57+
# the latter stored as a delta against $2:file.
58+
#
59+
# We convince pack-objects to make the delta in the direction of our choosing
60+
# by marking $2 as a preferred-base edge. That results in $1:file as a thin
61+
# delta, and index-pack completes it by adding $2:file as a base.
62+
#
63+
# Note that the two variants of "file" must be similar enough to convince git
64+
# to create the delta.
65+
make_pack () {
66+
{
67+
printf '%s\n' "-$(git rev-parse $2)"
68+
printf '%s dummy\n' "$(git rev-parse $1:dummy)"
69+
printf '%s file\n' "$(git rev-parse $1:file)"
70+
} |
71+
git pack-objects --stdout |
72+
git index-pack --stdin --fix-thin
73+
}
74+
75+
test_expect_success 'setup' '
76+
test-genrandom base 4096 >base &&
77+
for i in one two
78+
do
79+
# we want shared content here to encourage deltas...
80+
cp base file &&
81+
echo $i >>file &&
82+
83+
# ...whereas dummy should be short, because we do not want
84+
# deltas that would create duplicates when we --fix-thin
85+
echo $i >dummy &&
86+
87+
git add file dummy &&
88+
test_tick &&
89+
git commit -m $i ||
90+
return 1
91+
done &&
92+
93+
make_pack HEAD^ HEAD &&
94+
make_pack HEAD HEAD^
95+
'
96+
97+
test_expect_success 'repack' '
98+
# We first want to check that we do not have any internal errors,
99+
# and also that we do not hit the last-ditch cycle-breaking code
100+
# in write_object(), which will issue a warning to stderr.
101+
>expect &&
102+
git repack -ad 2>stderr &&
103+
test_cmp expect stderr &&
104+
105+
# And then double-check that the resulting pack is usable (i.e.,
106+
# we did not fail to notice any cycles). We know we are accessing
107+
# the objects via the new pack here, because "repack -d" will have
108+
# removed the others.
109+
git cat-file blob HEAD:file >/dev/null &&
110+
git cat-file blob HEAD^:file >/dev/null
111+
'
112+
113+
test_done

0 commit comments

Comments
 (0)