Skip to content

Commit f0f3caf

Browse files
committed
Merge branch 'hx/unpack-streaming' into seen
source: <[email protected]>

* hx/unpack-streaming:
  unpack-objects: use stream_loose_object() to unpack large objects
  core doc: modernize core.bigFileThreshold documentation
  object-file.c: add "stream_loose_object()" to handle large object
  object-file.c: factor out deflate part of write_loose_object()
  object-file.c: refactor write_loose_object() to several steps
  object-file.c: do fsync() and close() before post-write die()
  unpack-objects: low memory footprint for get_data() in dry_run mode
2 parents b372a93 + 54b13f9 commit f0f3caf

File tree

5 files changed

+387
-55
lines changed

5 files changed

+387
-55
lines changed

Documentation/config/core.txt

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
444444
Common unit suffixes of 'k', 'm', or 'g' are supported.
445445

446446
core.bigFileThreshold::
447-
Files larger than this size are stored deflated, without
448-
attempting delta compression. Storing large files without
449-
delta compression avoids excessive memory usage, at the
450-
slight expense of increased disk usage. Additionally files
451-
larger than this size are always treated as binary.
447+
The size of files considered "big", which as discussed below
448+
changes the behavior of numerous git commands, as well as how
449+
such files are stored within the repository. The default is
450+
512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
451+
supported.
452452
+
453-
Default is 512 MiB on all platforms. This should be reasonable
454-
for most projects as source code and other text files can still
455-
be delta compressed, but larger binary media files won't be.
453+
Files above the configured limit will be:
456454
+
457-
Common unit suffixes of 'k', 'm', or 'g' are supported.
455+
* Stored deflated, without attempting delta compression.
456+
+
457+
The default limit is primarily set with this use-case in mind. With it
458+
most projects will have their source code and other text files delta
459+
compressed, but not larger binary media files.
460+
+
461+
Storing large files without delta compression avoids excessive memory
462+
usage, at the slight expense of increased disk usage.
463+
+
464+
* Will be treated as though they were labeled "binary" (see
465+
linkgit:gitattributes[5]). This means that e.g. linkgit:git-log[1]
466+
and linkgit:git-diff[1] will not show diffs for files above this limit.
467+
+
468+
* Will generally be streamed when written, which avoids excessive
469+
memory usage, at the cost of some fixed overhead. Commands that make
470+
use of this include linkgit:git-archive[1],
471+
linkgit:git-fast-import[1], linkgit:git-index-pack[1],
472+
linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
458473

459474
core.excludesFile::
460475
Specifies the pathname to the file that contains patterns to

builtin/unpack-objects.c

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,26 @@ static void use(int bytes)
9797
display_throughput(progress, consumed_bytes);
9898
}
9999

100+
/*
101+
* Decompress a zstream from stdin and return a buffer of the requested size.
102+
* The caller is responsible to free the returned buffer.
103+
*
104+
* But for dry_run mode, "get_data()" is only used to check the
105+
* integrity of data, and the returned buffer is not used at all.
106+
* Therefore, in dry_run mode, "get_data()" will release the small
107+
* allocated buffer which is reused to hold temporary zstream output
108+
* and return NULL instead of returning garbage data.
109+
*/
100110
static void *get_data(unsigned long size)
101111
{
102112
git_zstream stream;
103-
void *buf = xmallocz(size);
113+
unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
114+
void *buf = xmallocz(bufsize);
104115

105116
memset(&stream, 0, sizeof(stream));
106117

107118
stream.next_out = buf;
108-
stream.avail_out = size;
119+
stream.avail_out = bufsize;
109120
stream.next_in = fill(1);
110121
stream.avail_in = len;
111122
git_inflate_init(&stream);
@@ -125,8 +136,15 @@ static void *get_data(unsigned long size)
125136
}
126137
stream.next_in = fill(1);
127138
stream.avail_in = len;
139+
if (dry_run) {
140+
/* reuse the buffer in dry_run mode */
141+
stream.next_out = buf;
142+
stream.avail_out = bufsize;
143+
}
128144
}
129145
git_inflate_end(&stream);
146+
if (dry_run)
147+
FREE_AND_NULL(buf);
130148
return buf;
131149
}
132150

@@ -326,10 +344,70 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
326344
{
327345
void *buf = get_data(size);
328346

329-
if (!dry_run && buf)
347+
if (buf)
330348
write_object(nr, type, buf, size);
331-
else
332-
free(buf);
349+
}
350+
351+
struct input_zstream_data {
352+
git_zstream *zstream;
353+
unsigned char buf[8192];
354+
int status;
355+
};
356+
357+
static const void *feed_input_zstream(struct input_stream *in_stream,
358+
unsigned long *readlen)
359+
{
360+
struct input_zstream_data *data = in_stream->data;
361+
git_zstream *zstream = data->zstream;
362+
void *in = fill(1);
363+
364+
if (in_stream->is_finished) {
365+
*readlen = 0;
366+
return NULL;
367+
}
368+
369+
zstream->next_out = data->buf;
370+
zstream->avail_out = sizeof(data->buf);
371+
zstream->next_in = in;
372+
zstream->avail_in = len;
373+
374+
data->status = git_inflate(zstream, 0);
375+
376+
in_stream->is_finished = data->status != Z_OK;
377+
use(len - zstream->avail_in);
378+
*readlen = sizeof(data->buf) - zstream->avail_out;
379+
380+
return data->buf;
381+
}
382+
383+
static void stream_blob(unsigned long size, unsigned nr)
384+
{
385+
git_zstream zstream = { 0 };
386+
struct input_zstream_data data = { 0 };
387+
struct input_stream in_stream = {
388+
.read = feed_input_zstream,
389+
.data = &data,
390+
};
391+
struct obj_info *info = &obj_list[nr];
392+
393+
data.zstream = &zstream;
394+
git_inflate_init(&zstream);
395+
396+
if (stream_loose_object(&in_stream, size, &info->oid))
397+
die(_("failed to write object in stream"));
398+
399+
if (data.status != Z_STREAM_END)
400+
die(_("inflate returned (%d)"), data.status);
401+
git_inflate_end(&zstream);
402+
403+
if (strict) {
404+
struct blob *blob = lookup_blob(the_repository, &info->oid);
405+
406+
if (!blob)
407+
die(_("invalid blob object from stream"));
408+
blob->object.flags |= FLAG_WRITTEN;
409+
}
410+
info->obj = NULL;
333411
}
334412

335413
static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +437,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
359437
oidread(&base_oid, fill(the_hash_algo->rawsz));
360438
use(the_hash_algo->rawsz);
361439
delta_data = get_data(delta_size);
362-
if (dry_run || !delta_data) {
363-
free(delta_data);
440+
if (!delta_data)
364441
return;
365-
}
366442
if (has_object_file(&base_oid))
367443
; /* Ok we have this one */
368444
else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +474,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
398474
die("offset value out of bound for delta base object");
399475

400476
delta_data = get_data(delta_size);
401-
if (dry_run || !delta_data) {
402-
free(delta_data);
477+
if (!delta_data)
403478
return;
404-
}
405479
lo = 0;
406480
hi = nr;
407481
while (lo < hi) {
@@ -468,9 +542,14 @@ static void unpack_one(unsigned nr)
468542
}
469543

470544
switch (type) {
545+
case OBJ_BLOB:
546+
if (!dry_run && size > big_file_threshold) {
547+
stream_blob(size, nr);
548+
return;
549+
}
550+
/* fallthrough */
471551
case OBJ_COMMIT:
472552
case OBJ_TREE:
473-
case OBJ_BLOB:
474553
case OBJ_TAG:
475554
unpack_non_delta_entry(type, size, nr);
476555
return;

0 commit comments

Comments
 (0)