Skip to content

Commit fe1ed56

Browse files
derrickstoleegitster
authored andcommitted
midx: sort and deduplicate objects from packfiles
Before writing a list of objects and their offsets to a multi-pack-index, we need to collect the list of objects contained in the packfiles. There may be multiple copies of some objects, so this list must be deduplicated. It is possible to artificially get into a state where there are many duplicate copies of objects. That can create high memory pressure if we are to create a list of all objects before de-duplication. To reduce this memory pressure without a significant performance drop, automatically group objects by the first byte of their object id. Use the IDX fanout tables to group the data, copy to a local array, then sort. Copy only the de-duplicated entries. Select the duplicate based on the most-recent modified time of a packfile containing the object. Signed-off-by: Derrick Stolee <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 3227565 commit fe1ed56

File tree

3 files changed

+147
-0
lines changed

3 files changed

+147
-0
lines changed

midx.c

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "lockfile.h"
55
#include "packfile.h"
66
#include "object-store.h"
7+
#include "packfile.h"
78
#include "midx.h"
89

910
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
@@ -182,12 +183,21 @@ static void add_pack_to_midx(const char *full_path, size_t full_path_len,
182183
packs->list[packs->nr] = add_packed_git(full_path,
183184
full_path_len,
184185
0);
186+
185187
if (!packs->list[packs->nr]) {
186188
warning(_("failed to add packfile '%s'"),
187189
full_path);
188190
return;
189191
}
190192

193+
if (open_pack_index(packs->list[packs->nr])) {
194+
warning(_("failed to open pack-index '%s'"),
195+
full_path);
196+
close_pack(packs->list[packs->nr]);
197+
FREE_AND_NULL(packs->list[packs->nr]);
198+
return;
199+
}
200+
191201
packs->names[packs->nr] = xstrdup(file_name);
192202
packs->pack_name_concat_len += strlen(file_name) + 1;
193203
packs->nr++;
@@ -228,6 +238,119 @@ static void sort_packs_by_name(char **pack_names, uint32_t nr_packs, uint32_t *p
228238
free(pairs);
229239
}
230240

241+
struct pack_midx_entry {
242+
struct object_id oid;
243+
uint32_t pack_int_id;
244+
time_t pack_mtime;
245+
uint64_t offset;
246+
};
247+
248+
static int midx_oid_compare(const void *_a, const void *_b)
249+
{
250+
const struct pack_midx_entry *a = (const struct pack_midx_entry *)_a;
251+
const struct pack_midx_entry *b = (const struct pack_midx_entry *)_b;
252+
int cmp = oidcmp(&a->oid, &b->oid);
253+
254+
if (cmp)
255+
return cmp;
256+
257+
if (a->pack_mtime > b->pack_mtime)
258+
return -1;
259+
else if (a->pack_mtime < b->pack_mtime)
260+
return 1;
261+
262+
return a->pack_int_id - b->pack_int_id;
263+
}
264+
265+
static void fill_pack_entry(uint32_t pack_int_id,
266+
struct packed_git *p,
267+
uint32_t cur_object,
268+
struct pack_midx_entry *entry)
269+
{
270+
if (!nth_packed_object_oid(&entry->oid, p, cur_object))
271+
die(_("failed to locate object %d in packfile"), cur_object);
272+
273+
entry->pack_int_id = pack_int_id;
274+
entry->pack_mtime = p->mtime;
275+
276+
entry->offset = nth_packed_object_offset(p, cur_object);
277+
}
278+
279+
/*
280+
* It is possible to artificially get into a state where there are many
281+
* duplicate copies of objects. That can create high memory pressure if
282+
* we are to create a list of all objects before de-duplication. To reduce
283+
* this memory pressure without a significant performance drop, automatically
284+
* group objects by the first byte of their object id. Use the IDX fanout
285+
* tables to group the data, copy to a local array, then sort.
286+
*
287+
* Copy only the de-duplicated entries (selected by most-recent modified time
288+
* of a packfile containing the object).
289+
*/
290+
static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
291+
uint32_t *perm,
292+
uint32_t nr_packs,
293+
uint32_t *nr_objects)
294+
{
295+
uint32_t cur_fanout, cur_pack, cur_object;
296+
uint32_t alloc_fanout, alloc_objects, total_objects = 0;
297+
struct pack_midx_entry *entries_by_fanout = NULL;
298+
struct pack_midx_entry *deduplicated_entries = NULL;
299+
300+
for (cur_pack = 0; cur_pack < nr_packs; cur_pack++)
301+
total_objects += p[cur_pack]->num_objects;
302+
303+
/*
304+
* As we de-duplicate by fanout value, we expect the fanout
305+
* slices to be evenly distributed, with some noise. Hence,
306+
* allocate slightly more than one 256th.
307+
*/
308+
alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16;
309+
310+
ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
311+
ALLOC_ARRAY(deduplicated_entries, alloc_objects);
312+
*nr_objects = 0;
313+
314+
for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
315+
uint32_t nr_fanout = 0;
316+
317+
for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
318+
uint32_t start = 0, end;
319+
320+
if (cur_fanout)
321+
start = get_pack_fanout(p[cur_pack], cur_fanout - 1);
322+
end = get_pack_fanout(p[cur_pack], cur_fanout);
323+
324+
for (cur_object = start; cur_object < end; cur_object++) {
325+
ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
326+
fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]);
327+
nr_fanout++;
328+
}
329+
}
330+
331+
QSORT(entries_by_fanout, nr_fanout, midx_oid_compare);
332+
333+
/*
334+
* The batch is now sorted by OID and then mtime (descending).
335+
* Take only the first duplicate.
336+
*/
337+
for (cur_object = 0; cur_object < nr_fanout; cur_object++) {
338+
if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid,
339+
&entries_by_fanout[cur_object].oid))
340+
continue;
341+
342+
ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects);
343+
memcpy(&deduplicated_entries[*nr_objects],
344+
&entries_by_fanout[cur_object],
345+
sizeof(struct pack_midx_entry));
346+
(*nr_objects)++;
347+
}
348+
}
349+
350+
free(entries_by_fanout);
351+
return deduplicated_entries;
352+
}
353+
231354
static size_t write_midx_pack_names(struct hashfile *f,
232355
char **pack_names,
233356
uint32_t num_packs)
@@ -271,6 +394,8 @@ int write_midx_file(const char *object_dir)
271394
uint64_t written = 0;
272395
uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
273396
uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
397+
uint32_t nr_entries;
398+
struct pack_midx_entry *entries = NULL;
274399

275400
midx_name = get_midx_filename(object_dir);
276401
if (safe_create_leading_directories(midx_name)) {
@@ -296,6 +421,8 @@ int write_midx_file(const char *object_dir)
296421
ALLOC_ARRAY(pack_perm, packs.nr);
297422
sort_packs_by_name(packs.names, packs.nr, pack_perm);
298423

424+
entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries);
425+
299426
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
300427
f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
301428
FREE_AND_NULL(midx_name);
@@ -365,5 +492,6 @@ int write_midx_file(const char *object_dir)
365492

366493
free(packs.list);
367494
free(packs.names);
495+
free(entries);
368496
return 0;
369497
}

packfile.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,23 @@ int open_pack_index(struct packed_git *p)
196196
return ret;
197197
}
198198

199+
uint32_t get_pack_fanout(struct packed_git *p, uint32_t value)
200+
{
201+
const uint32_t *level1_ofs = p->index_data;
202+
203+
if (!level1_ofs) {
204+
if (open_pack_index(p))
205+
return 0;
206+
level1_ofs = p->index_data;
207+
}
208+
209+
if (p->index_version > 1) {
210+
level1_ofs += 2;
211+
}
212+
213+
return ntohl(level1_ofs[value]);
214+
}
215+
199216
static struct packed_git *alloc_packed_git(int extra)
200217
{
201218
struct packed_git *p = xmalloc(st_add(sizeof(*p), extra));

packfile.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ extern int open_pack_index(struct packed_git *);
6969
*/
7070
extern void close_pack_index(struct packed_git *);
7171

72+
/*
 * Return entry 'value' of the fanout table in the index of pack 'p',
 * converted to host byte order.  The pack index is opened first if it
 * has not been loaded yet; 0 is returned if it cannot be opened.
 */
extern uint32_t get_pack_fanout(struct packed_git *p, uint32_t value);
73+
7274
extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t, unsigned long *);
7375
extern void close_pack_windows(struct packed_git *);
7476
extern void close_pack(struct packed_git *);

0 commit comments

Comments
 (0)