Skip to content

Commit 31e3d83

Browse files
committed
Merge branch 'jk/maint-avoid-streaming-filtered-contents'
* jk/maint-avoid-streaming-filtered-contents: do not stream large files to pack when filters are in use teach dry-run convert_to_git not to require a src buffer teach convert_to_git a "dry run" mode
2 parents ce25054 + 4f22b10 commit 31e3d83

File tree

4 files changed

+125
-9
lines changed

4 files changed

+125
-9
lines changed

convert.c

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,17 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
196196
char *dst;
197197

198198
if (crlf_action == CRLF_BINARY ||
199-
(crlf_action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) || !len)
199+
(crlf_action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) ||
200+
(src && !len))
200201
return 0;
201202

203+
/*
204+
* If we are doing a dry-run and have no source buffer, there is
205+
* nothing to analyze; we must assume we would convert.
206+
*/
207+
if (!buf && !src)
208+
return 1;
209+
202210
gather_stats(src, len, &stats);
203211

204212
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_GUESS) {
@@ -232,6 +240,13 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
232240
if (!stats.cr)
233241
return 0;
234242

243+
/*
244+
* At this point all of our source analysis is done, and we are sure we
245+
* would convert. If we are in dry-run mode, we can give an answer.
246+
*/
247+
if (!buf)
248+
return 1;
249+
235250
/* only grow if not in place */
236251
if (strbuf_avail(buf) + buf->len < len)
237252
strbuf_grow(buf, len - buf->len);
@@ -396,6 +411,9 @@ static int apply_filter(const char *path, const char *src, size_t len,
396411
if (!cmd)
397412
return 0;
398413

414+
if (!dst)
415+
return 1;
416+
399417
memset(&async, 0, sizeof(async));
400418
async.proc = filter_buffer;
401419
async.data = &params;
@@ -527,9 +545,12 @@ static int ident_to_git(const char *path, const char *src, size_t len,
527545
{
528546
char *dst, *dollar;
529547

530-
if (!ident || !count_ident(src, len))
548+
if (!ident || (src && !count_ident(src, len)))
531549
return 0;
532550

551+
if (!buf)
552+
return 1;
553+
533554
/* only grow if not in place */
534555
if (strbuf_avail(buf) + buf->len < len)
535556
strbuf_grow(buf, len - buf->len);
@@ -759,13 +780,13 @@ int convert_to_git(const char *path, const char *src, size_t len,
759780
filter = ca.drv->clean;
760781

761782
ret |= apply_filter(path, src, len, dst, filter);
762-
if (ret) {
783+
if (ret && dst) {
763784
src = dst->buf;
764785
len = dst->len;
765786
}
766787
ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr);
767788
ret |= crlf_to_git(path, src, len, dst, ca.crlf_action, checksafe);
768-
if (ret) {
789+
if (ret && dst) {
769790
src = dst->buf;
770791
len = dst->len;
771792
}

convert.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ extern int convert_to_working_tree(const char *path, const char *src,
4040
size_t len, struct strbuf *dst);
4141
extern int renormalize_buffer(const char *path, const char *src, size_t len,
4242
struct strbuf *dst);
43+
static inline int would_convert_to_git(const char *path, const char *src,
44+
size_t len, enum safe_crlf checksafe)
45+
{
46+
return convert_to_git(path, src, len, NULL, checksafe);
47+
}
4348

4449
/*****************************************************************
4550
*

sha1_file.c

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2700,10 +2700,13 @@ static int index_core(unsigned char *sha1, int fd, size_t size,
27002700
* This also bypasses the usual "convert-to-git" dance, and that is on
27012701
* purpose. We could write a streaming version of the converting
27022702
* functions and insert that before feeding the data to fast-import
2703-
* (or equivalent in-core API described above), but the primary
2704-
* motivation for trying to stream from the working tree file and to
2705-
* avoid mmaping it in core is to deal with large binary blobs, and
2706-
* by definition they do _not_ want to get any conversion.
2703+
* (or equivalent in-core API described above). However, that is
2704+
* somewhat complicated, as we do not know the size of the filter
2705+
* result, which we need to know beforehand when writing a git object.
2706+
* Since the primary motivation for trying to stream from the working
2707+
* tree file and to avoid mmaping it in core is to deal with large
2708+
* binary blobs, they generally do not want to get any conversion, and
2709+
* callers should avoid this code path when filters are requested.
27072710
*/
27082711
static int index_stream(unsigned char *sha1, int fd, size_t size,
27092712
enum object_type type, const char *path,
@@ -2720,7 +2723,8 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
27202723

27212724
if (!S_ISREG(st->st_mode))
27222725
ret = index_pipe(sha1, fd, type, path, flags);
2723-
else if (size <= big_file_threshold || type != OBJ_BLOB)
2726+
else if (size <= big_file_threshold || type != OBJ_BLOB ||
2727+
(path && would_convert_to_git(path, NULL, 0, 0)))
27242728
ret = index_core(sha1, fd, size, type, path, flags);
27252729
else
27262730
ret = index_stream(sha1, fd, size, type, path, flags);

t/t1051-large-conversion.sh

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/bin/sh
2+
3+
test_description='test conversion filters on large files'
4+
. ./test-lib.sh
5+
6+
set_attr() {
7+
test_when_finished 'rm -f .gitattributes' &&
8+
echo "* $*" >.gitattributes
9+
}
10+
11+
check_input() {
12+
git read-tree --empty &&
13+
git add small large &&
14+
git cat-file blob :small >small.index &&
15+
git cat-file blob :large | head -n 1 >large.index &&
16+
test_cmp small.index large.index
17+
}
18+
19+
check_output() {
20+
rm -f small large &&
21+
git checkout small large &&
22+
head -n 1 large >large.head &&
23+
test_cmp small large.head
24+
}
25+
26+
test_expect_success 'setup input tests' '
27+
printf "\$Id: foo\$\\r\\n" >small &&
28+
cat small small >large &&
29+
git config core.bigfilethreshold 20 &&
30+
git config filter.test.clean "sed s/.*/CLEAN/"
31+
'
32+
33+
test_expect_success 'autocrlf=true converts on input' '
34+
test_config core.autocrlf true &&
35+
check_input
36+
'
37+
38+
test_expect_success 'eol=crlf converts on input' '
39+
set_attr eol=crlf &&
40+
check_input
41+
'
42+
43+
test_expect_success 'ident converts on input' '
44+
set_attr ident &&
45+
check_input
46+
'
47+
48+
test_expect_success 'user-defined filters convert on input' '
49+
set_attr filter=test &&
50+
check_input
51+
'
52+
53+
test_expect_success 'setup output tests' '
54+
echo "\$Id\$" >small &&
55+
cat small small >large &&
56+
git add small large &&
57+
git config core.bigfilethreshold 7 &&
58+
git config filter.test.smudge "sed s/.*/SMUDGE/"
59+
'
60+
61+
test_expect_success 'autocrlf=true converts on output' '
62+
test_config core.autocrlf true &&
63+
check_output
64+
'
65+
66+
test_expect_success 'eol=crlf converts on output' '
67+
set_attr eol=crlf &&
68+
check_output
69+
'
70+
71+
test_expect_success 'user-defined filters convert on output' '
72+
set_attr filter=test &&
73+
check_output
74+
'
75+
76+
test_expect_success 'ident converts on output' '
77+
set_attr ident &&
78+
rm -f small large &&
79+
git checkout small large &&
80+
sed -n "s/Id: .*/Id: SHA/p" <small >small.clean &&
81+
head -n 1 large >large.head &&
82+
sed -n "s/Id: .*/Id: SHA/p" <large.head >large.clean &&
83+
test_cmp small.clean large.clean
84+
'
85+
86+
test_done

0 commit comments

Comments
 (0)