Skip to content

Commit 275abed

Browse files
committed
Merge branch 'tb/convert-stream-check' into pu
End-of-line conversion sometimes needs to see whether the current blob in the index has NULs and CRs in order to make its decision. We used to always gather full statistics over the whole blob, but in many cases we can return early once we have seen "enough" (e.g. if we see a single NUL, the blob will be handled as binary). The codepaths have been optimized by using the streaming interface. * tb/convert-stream-check: convert.c: stream and fast search for binary read-cache: factor out get_sha1_from_index() helper
2 parents a05ac4b + 737ff06 commit 275abed

File tree

3 files changed

+150
-73
lines changed

3 files changed

+150
-73
lines changed

cache.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,7 @@ extern void free_name_hash(struct index_state *istate);
380380
#define unmerge_cache_entry_at(at) unmerge_index_entry_at(&the_index, at)
381381
#define unmerge_cache(pathspec) unmerge_index(&the_index, pathspec)
382382
#define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
383+
#define get_sha1_from_cache(path) get_sha1_from_index(&the_index, (path))
383384
#endif
384385

385386
enum object_type {
@@ -1092,6 +1093,8 @@ static inline void *read_sha1_file(const unsigned char *sha1, enum object_type *
10921093
return read_sha1_file_extended(sha1, type, size, LOOKUP_REPLACE_OBJECT);
10931094
}
10941095

1096+
const unsigned char *get_sha1_from_index(struct index_state *istate, const char *path);
1097+
10951098
/*
10961099
* This internal function is only declared here for the benefit of
10971100
* lookup_replace_object(). Please do not call it directly.

convert.c

Lines changed: 129 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "quote.h"
55
#include "sigchain.h"
66
#include "pkt-line.h"
7+
#include "streaming.h"
78

89
/*
910
* convert.c - convert a file when checking it out and checking it in.
@@ -14,10 +15,12 @@
1415
* translation when the "text" attribute or "auto_crlf" option is set.
1516
*/
1617

17-
/* Stat bits: When BIN is set, the txt bits are unset */
1818
#define CONVERT_STAT_BITS_TXT_LF 0x1
1919
#define CONVERT_STAT_BITS_TXT_CRLF 0x2
2020
#define CONVERT_STAT_BITS_BIN 0x4
21+
#define CONVERT_STAT_BITS_ANY_CR 0x8
22+
23+
#define STREAM_BUFFER_SIZE (1024*16)
2124

2225
enum crlf_action {
2326
CRLF_UNDEFINED,
@@ -32,30 +35,36 @@ enum crlf_action {
3235

3336
struct text_stat {
3437
/* NUL, CR, LF and CRLF counts */
35-
unsigned nul, lonecr, lonelf, crlf;
38+
unsigned stat_bits, lonecr, lonelf, crlf;
3639

3740
/* These are just approximations! */
3841
unsigned printable, nonprintable;
3942
};
4043

41-
static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
44+
static void gather_stats_partly(const char *buf, unsigned long size,
45+
struct text_stat *stats, unsigned search_only)
4246
{
4347
unsigned long i;
4448

45-
memset(stats, 0, sizeof(*stats));
46-
49+
if (!buf || !size)
50+
return;
4751
for (i = 0; i < size; i++) {
4852
unsigned char c = buf[i];
4953
if (c == '\r') {
54+
stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
5055
if (i+1 < size && buf[i+1] == '\n') {
5156
stats->crlf++;
5257
i++;
53-
} else
58+
stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
59+
} else {
5460
stats->lonecr++;
61+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
62+
}
5563
continue;
5664
}
5765
if (c == '\n') {
5866
stats->lonelf++;
67+
stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
5968
continue;
6069
}
6170
if (c == 127)
@@ -68,14 +77,16 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
6877
stats->printable++;
6978
break;
7079
case 0:
71-
stats->nul++;
80+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
7281
/* fall through */
7382
default:
7483
stats->nonprintable++;
7584
}
7685
}
7786
else
7887
stats->printable++;
88+
if (stats->stat_bits & search_only)
89+
break; /* We found what we have been searching for */
7990
}
8091

8192
/* If file ends with EOF then don't count this EOF as non-printable. */
@@ -87,41 +98,62 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
8798
* The same heuristics as diff.c::mmfile_is_binary()
8899
* We treat files with bare CR as binary
89100
*/
90-
static int convert_is_binary(unsigned long size, const struct text_stat *stats)
101+
static void convert_nonprintable(struct text_stat *stats)
91102
{
92-
if (stats->lonecr)
93-
return 1;
94-
if (stats->nul)
95-
return 1;
96103
if ((stats->printable >> 7) < stats->nonprintable)
97-
return 1;
98-
return 0;
104+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
105+
}
106+
107+
static void gather_all_stats(const char *buf, unsigned long size,
108+
struct text_stat *stats, unsigned search_only)
109+
{
110+
memset(stats, 0, sizeof(*stats));
111+
gather_stats_partly(buf, size, stats, search_only);
112+
convert_nonprintable(stats);
99113
}
100114

101-
static unsigned int gather_convert_stats(const char *data, unsigned long size)
115+
116+
static unsigned get_convert_stats_sha1(unsigned const char *sha1,
117+
unsigned search_only)
102118
{
119+
struct git_istream *st;
103120
struct text_stat stats;
104-
int ret = 0;
105-
if (!data || !size)
121+
enum object_type type;
122+
unsigned long sz;
123+
124+
if (!sha1)
125+
return 0;
126+
memset(&stats, 0, sizeof(stats));
127+
st = open_istream(sha1, &type, &sz, NULL);
128+
if (!st) {
106129
return 0;
107-
gather_stats(data, size, &stats);
108-
if (convert_is_binary(size, &stats))
109-
ret |= CONVERT_STAT_BITS_BIN;
110-
if (stats.crlf)
111-
ret |= CONVERT_STAT_BITS_TXT_CRLF;
112-
if (stats.lonelf)
113-
ret |= CONVERT_STAT_BITS_TXT_LF;
114-
115-
return ret;
130+
}
131+
if (type != OBJ_BLOB)
132+
goto close_and_exit_i;
133+
for (;;) {
134+
char buf[STREAM_BUFFER_SIZE];
135+
ssize_t readlen = read_istream(st, buf, sizeof(buf));
136+
if (readlen < 0)
137+
break;
138+
if (!readlen)
139+
break;
140+
gather_stats_partly(buf, (unsigned long)readlen, &stats, search_only);
141+
if (stats.stat_bits & search_only)
142+
break; /* We found what we have been searching for */
143+
}
144+
close_and_exit_i:
145+
close_istream(st);
146+
convert_nonprintable(&stats);
147+
return stats.stat_bits;
116148
}
117149

118-
static const char *gather_convert_stats_ascii(const char *data, unsigned long size)
150+
static const char *convert_stats_ascii(unsigned convert_stats)
119151
{
120-
unsigned int convert_stats = gather_convert_stats(data, size);
121-
152+
const unsigned eol_bits = CONVERT_STAT_BITS_TXT_LF |
153+
CONVERT_STAT_BITS_TXT_CRLF;
122154
if (convert_stats & CONVERT_STAT_BITS_BIN)
123155
return "-text";
124-
switch (convert_stats) {
156+
switch (convert_stats & eol_bits) {
125157
case CONVERT_STAT_BITS_TXT_LF:
126158
return "lf";
127159
case CONVERT_STAT_BITS_TXT_CRLF:
@@ -133,24 +165,45 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si
133165
}
134166
}
135167

168+
static unsigned get_convert_stats_wt(const char *path)
169+
{
170+
struct text_stat stats;
171+
unsigned search_only = CONVERT_STAT_BITS_BIN;
172+
int fd;
173+
memset(&stats, 0, sizeof(stats));
174+
fd = open(path, O_RDONLY);
175+
if (fd < 0)
176+
return 0;
177+
for (;;) {
178+
char buf[STREAM_BUFFER_SIZE];
179+
ssize_t readlen = read(fd, buf, sizeof(buf));
180+
if (readlen < 0)
181+
break;
182+
if (!readlen)
183+
break;
184+
gather_stats_partly(buf, (unsigned long)readlen, &stats, search_only);
185+
if (stats.stat_bits & search_only)
186+
break; /* We found what we have been searching for */
187+
}
188+
close(fd);
189+
convert_nonprintable(&stats);
190+
return stats.stat_bits;
191+
}
192+
136193
const char *get_cached_convert_stats_ascii(const char *path)
137194
{
138-
const char *ret;
139-
unsigned long sz;
140-
void *data = read_blob_data_from_cache(path, &sz);
141-
ret = gather_convert_stats_ascii(data, sz);
142-
free(data);
143-
return ret;
195+
unsigned convert_stats;
196+
unsigned search_only = CONVERT_STAT_BITS_BIN;
197+
convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
198+
search_only);
199+
return convert_stats_ascii(convert_stats);
144200
}
145201

146202
const char *get_wt_convert_stats_ascii(const char *path)
147203
{
148-
const char *ret = "";
149-
struct strbuf sb = STRBUF_INIT;
150-
if (strbuf_read_file(&sb, path, 0) >= 0)
151-
ret = gather_convert_stats_ascii(sb.buf, sb.len);
152-
strbuf_release(&sb);
153-
return ret;
204+
unsigned convert_stats;
205+
convert_stats = get_convert_stats_wt(path);
206+
return convert_stats_ascii(convert_stats);
154207
}
155208

156209
static int text_eol_is_crlf(void)
@@ -218,16 +271,10 @@ static void check_safe_crlf(const char *path, enum crlf_action crlf_action,
218271

219272
static int has_cr_in_index(const char *path)
220273
{
221-
unsigned long sz;
222-
void *data;
223-
int has_cr;
224-
225-
data = read_blob_data_from_cache(path, &sz);
226-
if (!data)
227-
return 0;
228-
has_cr = memchr(data, '\r', sz) != NULL;
229-
free(data);
230-
return has_cr;
274+
unsigned convert_stats;
275+
convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
276+
CONVERT_STAT_BITS_ANY_CR);
277+
return convert_stats & CONVERT_STAT_BITS_ANY_CR;
231278
}
232279

233280
static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
@@ -239,13 +286,13 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
239286
if (!stats->lonelf)
240287
return 0;
241288

242-
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
289+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_CRLF) {
243290
/* If we have any CR or CRLF line endings, we do not touch it */
244291
/* This is the new safer autocrlf-handling */
245292
if (stats->lonecr || stats->crlf)
246293
return 0;
247294

248-
if (convert_is_binary(len, stats))
295+
if (stats->stat_bits & CONVERT_STAT_BITS_BIN)
249296
return 0;
250297
}
251298
return 1;
@@ -258,7 +305,8 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
258305
{
259306
struct text_stat stats;
260307
char *dst;
261-
int convert_crlf_into_lf;
308+
int has_crlf_to_convert;
309+
unsigned search_only = 0;
262310

263311
if (crlf_action == CRLF_BINARY ||
264312
(src && !len))
@@ -271,12 +319,16 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
271319
if (!buf && !src)
272320
return 1;
273321

274-
gather_stats(src, len, &stats);
322+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF)
323+
search_only = CONVERT_STAT_BITS_BIN;
324+
325+
gather_all_stats(src, len, &stats, search_only);
326+
275327
/* Optimization: No CRLF? Nothing to convert, regardless. */
276-
convert_crlf_into_lf = !!stats.crlf;
328+
has_crlf_to_convert = !!stats.crlf;
277329

278330
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
279-
if (convert_is_binary(len, &stats))
331+
if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
280332
return 0;
281333
/*
282334
* If the file in the index has any CR in it, do not
@@ -285,25 +337,36 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
285337
* cherry-pick.
286338
*/
287339
if ((checksafe != SAFE_CRLF_RENORMALIZE) && has_cr_in_index(path))
288-
convert_crlf_into_lf = 0;
340+
has_crlf_to_convert = 0;
289341
}
290342
if ((checksafe == SAFE_CRLF_WARN ||
291343
(checksafe == SAFE_CRLF_FAIL)) && len) {
292344
struct text_stat new_stats;
293345
memcpy(&new_stats, &stats, sizeof(new_stats));
294346
/* simulate "git add" */
295-
if (convert_crlf_into_lf) {
347+
if (has_crlf_to_convert) {
296348
new_stats.lonelf += new_stats.crlf;
297349
new_stats.crlf = 0;
350+
/* all crlf, if any, are gone. Update the bits */
351+
new_stats.stat_bits = stats.stat_bits & CONVERT_STAT_BITS_BIN;
352+
if (new_stats.lonelf)
353+
new_stats.stat_bits |= CONVERT_STAT_BITS_TXT_LF;
354+
if (new_stats.lonecr)
355+
new_stats.stat_bits |= CONVERT_STAT_BITS_ANY_CR;
298356
}
299357
/* simulate "git checkout" */
300358
if (will_convert_lf_to_crlf(len, &new_stats, crlf_action)) {
301359
new_stats.crlf += new_stats.lonelf;
302360
new_stats.lonelf = 0;
361+
new_stats.stat_bits = stats.stat_bits & CONVERT_STAT_BITS_BIN;
362+
if (new_stats.crlf)
363+
new_stats.stat_bits |= CONVERT_STAT_BITS_TXT_CRLF | CONVERT_STAT_BITS_ANY_CR;
364+
if (new_stats.lonecr)
365+
new_stats.stat_bits |= CONVERT_STAT_BITS_ANY_CR;
303366
}
304367
check_safe_crlf(path, crlf_action, &stats, &new_stats, checksafe);
305368
}
306-
if (!convert_crlf_into_lf)
369+
if (!has_crlf_to_convert)
307370
return 0;
308371

309372
/*
@@ -344,11 +407,15 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
344407
{
345408
char *to_free = NULL;
346409
struct text_stat stats;
410+
unsigned search_only = 0;
347411

348412
if (!len || output_eol(crlf_action) != EOL_CRLF)
349413
return 0;
350414

351-
gather_stats(src, len, &stats);
415+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_CRLF)
416+
search_only = CONVERT_STAT_BITS_ANY_CR | CONVERT_STAT_BITS_BIN;
417+
418+
gather_all_stats(src, len, &stats, search_only);
352419
if (!will_convert_lf_to_crlf(len, &stats, crlf_action))
353420
return 0;
354421

0 commit comments

Comments
 (0)