Skip to content

Commit 96b967b

Browse files
tboegi authored and gitster committed
convert.c: stream and fast search for binary
When statistics are gathered for the autocrlf handling, the search through the content can be stopped early, for example: (a) when a search for binary is done and a NUL character is found, or (b) when a search for CRLF is done and the first CRLF is found. The same applies when statistics for binary vs. non-binary are gathered: as soon as a lone CR or a NUL is found, the search can be aborted. When checking out files in "auto" mode, any file that has a "lone CR" or a CRLF will not be converted, so the search can be aborted early. Add the new bit, CONVERT_STAT_BITS_ANY_CR, which is set for either a lone CR or a CRLF. Many binary files have a NUL very early, and it is often not necessary to load the whole content of a file or blob into memory. Split gather_stats() into gather_all_stats() and gather_stats_partly() to allow streaming handling of blobs and of files in the worktree. Signed-off-by: Torsten Bögershausen <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 8980690 commit 96b967b

File tree

1 file changed

+129
-62
lines changed

1 file changed

+129
-62
lines changed

convert.c

Lines changed: 129 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "run-command.h"
44
#include "quote.h"
55
#include "sigchain.h"
6+
#include "streaming.h"
67

78
/*
89
* convert.c - convert a file when checking it out and checking it in.
@@ -13,10 +14,12 @@
1314
* translation when the "text" attribute or "auto_crlf" option is set.
1415
*/
1516

16-
/* Stat bits: When BIN is set, the txt bits are unset */
1717
#define CONVERT_STAT_BITS_TXT_LF 0x1
1818
#define CONVERT_STAT_BITS_TXT_CRLF 0x2
1919
#define CONVERT_STAT_BITS_BIN 0x4
20+
#define CONVERT_STAT_BITS_ANY_CR 0x8
21+
22+
#define STREAM_BUFFER_SIZE (1024*16)
2023

2124
enum crlf_action {
2225
CRLF_UNDEFINED,
@@ -31,30 +34,36 @@ enum crlf_action {
3134

3235
struct text_stat {
3336
/* NUL, CR, LF and CRLF counts */
34-
unsigned nul, lonecr, lonelf, crlf;
37+
unsigned stat_bits, lonecr, lonelf, crlf;
3538

3639
/* These are just approximations! */
3740
unsigned printable, nonprintable;
3841
};
3942

40-
static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
43+
static void gather_stats_partly(const char *buf, unsigned long size,
44+
struct text_stat *stats, unsigned search_only)
4145
{
4246
unsigned long i;
4347

44-
memset(stats, 0, sizeof(*stats));
45-
48+
if (!buf || !size)
49+
return;
4650
for (i = 0; i < size; i++) {
4751
unsigned char c = buf[i];
4852
if (c == '\r') {
53+
stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
4954
if (i+1 < size && buf[i+1] == '\n') {
5055
stats->crlf++;
5156
i++;
52-
} else
57+
stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
58+
} else {
5359
stats->lonecr++;
60+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
61+
}
5462
continue;
5563
}
5664
if (c == '\n') {
5765
stats->lonelf++;
66+
stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
5867
continue;
5968
}
6069
if (c == 127)
@@ -67,14 +76,16 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
6776
stats->printable++;
6877
break;
6978
case 0:
70-
stats->nul++;
79+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
7180
/* fall through */
7281
default:
7382
stats->nonprintable++;
7483
}
7584
}
7685
else
7786
stats->printable++;
87+
if (stats->stat_bits & search_only)
88+
break; /* We found what we have been searching for */
7889
}
7990

8091
/* If file ends with EOF then don't count this EOF as non-printable. */
@@ -86,41 +97,62 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
8697
* The same heuristics as diff.c::mmfile_is_binary()
8798
* We treat files with bare CR as binary
8899
*/
89-
static int convert_is_binary(unsigned long size, const struct text_stat *stats)
100+
static void convert_nonprintable(struct text_stat *stats)
90101
{
91-
if (stats->lonecr)
92-
return 1;
93-
if (stats->nul)
94-
return 1;
95102
if ((stats->printable >> 7) < stats->nonprintable)
96-
return 1;
97-
return 0;
103+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
98104
}
99105

100-
static unsigned int gather_convert_stats(const char *data, unsigned long size)
106+
static void gather_all_stats(const char *buf, unsigned long size,
107+
struct text_stat *stats, unsigned search_only)
101108
{
109+
memset(stats, 0, sizeof(*stats));
110+
gather_stats_partly(buf, size, stats, search_only);
111+
convert_nonprintable(stats);
112+
}
113+
114+
115+
static unsigned get_convert_stats_sha1(unsigned const char *sha1,
116+
unsigned search_only)
117+
{
118+
struct git_istream *st;
102119
struct text_stat stats;
103-
int ret = 0;
104-
if (!data || !size)
105-
return 0;
106-
gather_stats(data, size, &stats);
107-
if (convert_is_binary(size, &stats))
108-
ret |= CONVERT_STAT_BITS_BIN;
109-
if (stats.crlf)
110-
ret |= CONVERT_STAT_BITS_TXT_CRLF;
111-
if (stats.lonelf)
112-
ret |= CONVERT_STAT_BITS_TXT_LF;
120+
enum object_type type;
121+
unsigned long sz;
113122

114-
return ret;
123+
if (!sha1)
124+
return 0;
125+
memset(&stats, 0, sizeof(stats));
126+
st = open_istream(sha1, &type, &sz, NULL);
127+
if (!st) {
128+
return 0;
129+
}
130+
if (type != OBJ_BLOB)
131+
goto close_and_exit_i;
132+
for (;;) {
133+
char buf[STREAM_BUFFER_SIZE];
134+
ssize_t readlen = read_istream(st, buf, sizeof(buf));
135+
if (readlen < 0)
136+
break;
137+
if (!readlen)
138+
break;
139+
gather_stats_partly(buf, (unsigned long)readlen, &stats, search_only);
140+
if (stats.stat_bits & search_only)
141+
break; /* We found what we have been searching for */
142+
}
143+
close_and_exit_i:
144+
close_istream(st);
145+
convert_nonprintable(&stats);
146+
return stats.stat_bits;
115147
}
116148

117-
static const char *gather_convert_stats_ascii(const char *data, unsigned long size)
149+
static const char *convert_stats_ascii(unsigned convert_stats)
118150
{
119-
unsigned int convert_stats = gather_convert_stats(data, size);
120-
151+
const unsigned eol_bits = CONVERT_STAT_BITS_TXT_LF |
152+
CONVERT_STAT_BITS_TXT_CRLF;
121153
if (convert_stats & CONVERT_STAT_BITS_BIN)
122154
return "-text";
123-
switch (convert_stats) {
155+
switch (convert_stats & eol_bits) {
124156
case CONVERT_STAT_BITS_TXT_LF:
125157
return "lf";
126158
case CONVERT_STAT_BITS_TXT_CRLF:
@@ -132,24 +164,45 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si
132164
}
133165
}
134166

167+
static unsigned get_convert_stats_wt(const char *path)
168+
{
169+
struct text_stat stats;
170+
unsigned search_only = CONVERT_STAT_BITS_BIN;
171+
int fd;
172+
memset(&stats, 0, sizeof(stats));
173+
fd = open(path, O_RDONLY);
174+
if (fd < 0)
175+
return 0;
176+
for (;;) {
177+
char buf[STREAM_BUFFER_SIZE];
178+
ssize_t readlen = read(fd, buf, sizeof(buf));
179+
if (readlen < 0)
180+
break;
181+
if (!readlen)
182+
break;
183+
gather_stats_partly(buf, (unsigned long)readlen, &stats, search_only);
184+
if (stats.stat_bits & search_only)
185+
break; /* We found what we have been searching for */
186+
}
187+
close(fd);
188+
convert_nonprintable(&stats);
189+
return stats.stat_bits;
190+
}
191+
135192
const char *get_cached_convert_stats_ascii(const char *path)
136193
{
137-
const char *ret;
138-
unsigned long sz;
139-
void *data = read_blob_data_from_cache(path, &sz);
140-
ret = gather_convert_stats_ascii(data, sz);
141-
free(data);
142-
return ret;
194+
unsigned convert_stats;
195+
unsigned search_only = CONVERT_STAT_BITS_BIN;
196+
convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
197+
search_only);
198+
return convert_stats_ascii(convert_stats);
143199
}
144200

145201
const char *get_wt_convert_stats_ascii(const char *path)
146202
{
147-
const char *ret = "";
148-
struct strbuf sb = STRBUF_INIT;
149-
if (strbuf_read_file(&sb, path, 0) >= 0)
150-
ret = gather_convert_stats_ascii(sb.buf, sb.len);
151-
strbuf_release(&sb);
152-
return ret;
203+
unsigned convert_stats;
204+
convert_stats = get_convert_stats_wt(path);
205+
return convert_stats_ascii(convert_stats);
153206
}
154207

155208
static int text_eol_is_crlf(void)
@@ -213,16 +266,10 @@ static void check_safe_crlf(const char *path, enum crlf_action crlf_action,
213266

214267
static int has_cr_in_index(const char *path)
215268
{
216-
unsigned long sz;
217-
void *data;
218-
int has_cr;
219-
220-
data = read_blob_data_from_cache(path, &sz);
221-
if (!data)
222-
return 0;
223-
has_cr = memchr(data, '\r', sz) != NULL;
224-
free(data);
225-
return has_cr;
269+
unsigned convert_stats;
270+
convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
271+
CONVERT_STAT_BITS_ANY_CR);
272+
return convert_stats & CONVERT_STAT_BITS_ANY_CR;
226273
}
227274

228275
static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
@@ -234,13 +281,13 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
234281
if (!stats->lonelf)
235282
return 0;
236283

237-
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
284+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_CRLF) {
238285
/* If we have any CR or CRLF line endings, we do not touch it */
239286
/* This is the new safer autocrlf-handling */
240287
if (stats->lonecr || stats->crlf)
241288
return 0;
242289

243-
if (convert_is_binary(len, stats))
290+
if (stats->stat_bits & CONVERT_STAT_BITS_BIN)
244291
return 0;
245292
}
246293
return 1;
@@ -253,7 +300,8 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
253300
{
254301
struct text_stat stats;
255302
char *dst;
256-
int convert_crlf_into_lf;
303+
int has_crlf_to_convert;
304+
unsigned search_only = 0;
257305

258306
if (crlf_action == CRLF_BINARY ||
259307
(src && !len))
@@ -266,12 +314,16 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
266314
if (!buf && !src)
267315
return 1;
268316

269-
gather_stats(src, len, &stats);
317+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF)
318+
search_only = CONVERT_STAT_BITS_BIN;
319+
320+
gather_all_stats(src, len, &stats, search_only);
321+
270322
/* Optimization: No CRLF? Nothing to convert, regardless. */
271-
convert_crlf_into_lf = !!stats.crlf;
323+
has_crlf_to_convert = !!stats.crlf;
272324

273325
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
274-
if (convert_is_binary(len, &stats))
326+
if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
275327
return 0;
276328
/*
277329
* If the file in the index has any CR in it, do not convert.
@@ -280,24 +332,35 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
280332
if (checksafe == SAFE_CRLF_RENORMALIZE)
281333
checksafe = SAFE_CRLF_FALSE;
282334
else if (has_cr_in_index(path))
283-
convert_crlf_into_lf = 0;
335+
has_crlf_to_convert = 0;
284336
}
285337
if (checksafe && len) {
286338
struct text_stat new_stats;
287339
memcpy(&new_stats, &stats, sizeof(new_stats));
288340
/* simulate "git add" */
289-
if (convert_crlf_into_lf) {
341+
if (has_crlf_to_convert) {
290342
new_stats.lonelf += new_stats.crlf;
291343
new_stats.crlf = 0;
344+
/* all crlf, if any, are gone. Update the bits */
345+
new_stats.stat_bits = stats.stat_bits & CONVERT_STAT_BITS_BIN;
346+
if (new_stats.lonelf)
347+
new_stats.stat_bits |= CONVERT_STAT_BITS_TXT_LF;
348+
if (new_stats.lonecr)
349+
new_stats.stat_bits |= CONVERT_STAT_BITS_ANY_CR;
292350
}
293351
/* simulate "git checkout" */
294352
if (will_convert_lf_to_crlf(len, &new_stats, crlf_action)) {
295353
new_stats.crlf += new_stats.lonelf;
296354
new_stats.lonelf = 0;
355+
new_stats.stat_bits = stats.stat_bits & CONVERT_STAT_BITS_BIN;
356+
if (new_stats.crlf)
357+
new_stats.stat_bits |= CONVERT_STAT_BITS_TXT_CRLF | CONVERT_STAT_BITS_ANY_CR;
358+
if (new_stats.lonecr)
359+
new_stats.stat_bits |= CONVERT_STAT_BITS_ANY_CR;
297360
}
298361
check_safe_crlf(path, crlf_action, &stats, &new_stats, checksafe);
299362
}
300-
if (!convert_crlf_into_lf)
363+
if (!has_crlf_to_convert)
301364
return 0;
302365

303366
/*
@@ -338,11 +401,15 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
338401
{
339402
char *to_free = NULL;
340403
struct text_stat stats;
404+
unsigned search_only = 0;
341405

342406
if (!len || output_eol(crlf_action) != EOL_CRLF)
343407
return 0;
344408

345-
gather_stats(src, len, &stats);
409+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_CRLF)
410+
search_only = CONVERT_STAT_BITS_ANY_CR | CONVERT_STAT_BITS_BIN;
411+
412+
gather_all_stats(src, len, &stats, search_only);
346413
if (!will_convert_lf_to_crlf(len, &stats, crlf_action))
347414
return 0;
348415

0 commit comments

Comments
 (0)