
Commit 2a48c83

Merge branch 'hx/unpack-streaming' into seen
Allow large objects read from a packstream to be streamed into a loose
object file straight, without having to keep it in-core as a whole.

* hx/unpack-streaming:
  unpack-objects: use stream_loose_object() to unpack large objects
  core doc: modernize core.bigFileThreshold documentation
  object-file.c: add "stream_loose_object()" to handle large object
  object-file.c: factor out deflate part of write_loose_object()
  object-file.c: refactor write_loose_object() to several steps
  unpack-objects: low memory footprint for get_data() in dry_run mode

2 parents: d75be09 + aaf8122
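
At the center of the series is the new stream_loose_object() entry point, driven by a caller-supplied input_stream whose callback hands back one chunk of data at a time. The following is a minimal sketch of that calling convention, based only on the struct fields and the call visible in the builtin/unpack-objects.c diff below; the buffer-backed source and the write_one_blob() helper are hypothetical, and the sketch assumes git's internal headers that declare struct input_stream, stream_loose_object() and die().

/*
 * Sketch of a caller of the new streaming API.  The .read/.data/
 * .is_finished fields and the stream_loose_object() call mirror the
 * stream_blob()/feed_input_zstream() code added in this merge; the
 * buffer-backed source is purely illustrative.
 */
struct buf_source {
	const unsigned char *buf;
	size_t len, pos;
};

static const void *read_from_buf(struct input_stream *in, unsigned long *readlen)
{
	struct buf_source *src = in->data;
	size_t chunk = src->len - src->pos;
	const void *p = src->buf + src->pos;

	if (chunk > 8192)
		chunk = 8192;		/* hand out fixed-size slices */
	src->pos += chunk;
	in->is_finished = src->pos == src->len;
	*readlen = chunk;
	return chunk ? p : NULL;
}

static void write_one_blob(const unsigned char *buf, size_t len)
{
	struct buf_source src = { .buf = buf, .len = len };
	struct input_stream in = { .read = read_from_buf, .data = &src };
	struct object_id oid;

	/*
	 * The object is deflated straight into a loose object file; at
	 * no point does the whole payload have to sit in memory.
	 */
	if (stream_loose_object(&in, len, &oid))
		die(_("failed to write object in stream"));
}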


5 files changed: +405 / -51 lines changed


Documentation/config/core.txt

Lines changed: 24 additions & 9 deletions
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 	Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression. Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-	Default is 512 MiB on all platforms. This should be reasonable
-	for most projects as source code and other text files can still
-	be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-	Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+  linkgit:git-diff[1] will not compute diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+  memory usage, at the cost of some fixed overhead. Commands that make
+  use of this include linkgit:git-archive[1],
+  linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+  linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
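
The documentation above leans on git's size-suffix parsing ('k', 'm', 'g') and a single byte-count threshold. As a small, hedged illustration, the sketch below uses git's existing git_parse_ulong() helper, which is not touched by this commit, to show how a value such as "512m" becomes the number the size checks in the code compare against; its use here is an assumption about unrelated git internals, not part of this diff.

/*
 * Sketch only: git_parse_ulong() is git's size parser with 'k'/'m'/'g'
 * suffix support; it lives outside this commit and is unchanged by it.
 */
unsigned long threshold;

if (!git_parse_ulong("512m", &threshold))
	die("invalid core.bigFileThreshold value");
/*
 * threshold is now 536870912 (512 MiB).  Objects larger than this get
 * the "big file" treatment listed above, e.g. the streaming write used
 * by the unpack-objects change further down.
 */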

builtin/unpack-objects.c

Lines changed: 94 additions & 12 deletions
@@ -97,15 +97,27 @@ static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from the standard input into a newly
+ * allocated buffer of specified size and return the buffer.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize > size - stream.total_out ?
					   size - stream.total_out :
					   bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
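
The dry_run arithmetic above caps memory use at one 8192-byte scratch buffer while still inflating (and therefore validating) the whole zstream: before each refill the output window is rewound to min(bufsize, size - total_out). A worked example with invented sizes, 20000 bytes through the 8192-byte buffer, shows the windows it produces:

#include <stdio.h>

/*
 * Illustration of get_data()'s dry_run window arithmetic.  The sizes
 * are made up for the example; only the min() computation matches the
 * patch above.
 */
int main(void)
{
	unsigned long size = 20000, bufsize = 8192, total_out = 0;

	while (total_out < size) {
		unsigned long avail_out = bufsize > size - total_out ?
					  size - total_out : bufsize;
		total_out += avail_out;
		printf("window of %lu bytes, %lu/%lu inflated\n",
		       avail_out, total_out, size);
	}
	return 0;	/* windows: 8192, 8192, 3616 */
}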

@@ -326,10 +347,70 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
+}
+
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +440,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +477,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
@@ -468,9 +545,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;

0 commit comments
