Skip to content

Commit 0bdb089

Browse files
committed
Merge branch 'tb/convert-stream-check' into pu
End-of-line conversion sometimes needs to see if the current blob in the index has NULs and CRs to base its decision on. We used to always gather full statistics over the blob, but in many cases we can return early when we have seen "enough" (e.g. if we see a single NUL, the blob will be handled as binary). The codepaths have been optimized by using the streaming interface. * tb/convert-stream-check: convert.c: stream and fast search for binary; read-cache: factor out get_sha1_from_index() helper
2 parents ce3eac6 + 96b967b commit 0bdb089

File tree

3 files changed

+150
-73
lines changed

3 files changed

+150
-73
lines changed

cache.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -380,6 +380,7 @@ extern void free_name_hash(struct index_state *istate);
380380
#define unmerge_cache_entry_at(at) unmerge_index_entry_at(&the_index, at)
381381
#define unmerge_cache(pathspec) unmerge_index(&the_index, pathspec)
382382
#define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
383+
#define get_sha1_from_cache(path) get_sha1_from_index (&the_index, (path))
383384
#endif
384385

385386
enum object_type {
@@ -1093,6 +1094,8 @@ static inline void *read_sha1_file(const unsigned char *sha1, enum object_type *
10931094
return read_sha1_file_extended(sha1, type, size, LOOKUP_REPLACE_OBJECT);
10941095
}
10951096

1097+
const unsigned char *get_sha1_from_index(struct index_state *istate, const char *path);
1098+
10961099
/*
10971100
* This internal function is only declared here for the benefit of
10981101
* lookup_replace_object(). Please do not call it directly.

convert.c

Lines changed: 129 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
#include "quote.h"
55
#include "sigchain.h"
66
#include "pkt-line.h"
7+
#include "streaming.h"
78

89
/*
910
* convert.c - convert a file when checking it out and checking it in.
@@ -14,10 +15,12 @@
1415
* translation when the "text" attribute or "auto_crlf" option is set.
1516
*/
1617

17-
/* Stat bits: When BIN is set, the txt bits are unset */
1818
#define CONVERT_STAT_BITS_TXT_LF 0x1
1919
#define CONVERT_STAT_BITS_TXT_CRLF 0x2
2020
#define CONVERT_STAT_BITS_BIN 0x4
21+
#define CONVERT_STAT_BITS_ANY_CR 0x8
22+
23+
#define STREAM_BUFFER_SIZE (1024*16)
2124

2225
enum crlf_action {
2326
CRLF_UNDEFINED,
@@ -32,30 +35,36 @@ enum crlf_action {
3235

3336
struct text_stat {
3437
/* NUL, CR, LF and CRLF counts */
35-
unsigned nul, lonecr, lonelf, crlf;
38+
unsigned stat_bits, lonecr, lonelf, crlf;
3639

3740
/* These are just approximations! */
3841
unsigned printable, nonprintable;
3942
};
4043

41-
static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
44+
static void gather_stats_partly(const char *buf, unsigned long size,
45+
struct text_stat *stats, unsigned search_only)
4246
{
4347
unsigned long i;
4448

45-
memset(stats, 0, sizeof(*stats));
46-
49+
if (!buf || !size)
50+
return;
4751
for (i = 0; i < size; i++) {
4852
unsigned char c = buf[i];
4953
if (c == '\r') {
54+
stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
5055
if (i+1 < size && buf[i+1] == '\n') {
5156
stats->crlf++;
5257
i++;
53-
} else
58+
stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
59+
} else {
5460
stats->lonecr++;
61+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
62+
}
5563
continue;
5664
}
5765
if (c == '\n') {
5866
stats->lonelf++;
67+
stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
5968
continue;
6069
}
6170
if (c == 127)
@@ -68,14 +77,16 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
6877
stats->printable++;
6978
break;
7079
case 0:
71-
stats->nul++;
80+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
7281
/* fall through */
7382
default:
7483
stats->nonprintable++;
7584
}
7685
}
7786
else
7887
stats->printable++;
88+
if (stats->stat_bits & search_only)
89+
break; /* We found what we have been searching for */
7990
}
8091

8192
/* If file ends with EOF then don't count this EOF as non-printable. */
@@ -87,41 +98,62 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
8798
* The same heuristics as diff.c::mmfile_is_binary()
8899
* We treat files with bare CR as binary
89100
*/
90-
static int convert_is_binary(unsigned long size, const struct text_stat *stats)
101+
static void convert_nonprintable(struct text_stat *stats)
91102
{
92-
if (stats->lonecr)
93-
return 1;
94-
if (stats->nul)
95-
return 1;
96103
if ((stats->printable >> 7) < stats->nonprintable)
97-
return 1;
98-
return 0;
104+
stats->stat_bits |= CONVERT_STAT_BITS_BIN;
105+
}
106+
107+
static void gather_all_stats(const char *buf, unsigned long size,
108+
struct text_stat *stats, unsigned search_only)
109+
{
110+
memset(stats, 0, sizeof(*stats));
111+
gather_stats_partly(buf, size, stats, search_only);
112+
convert_nonprintable(stats);
99113
}
100114

101-
static unsigned int gather_convert_stats(const char *data, unsigned long size)
115+
116+
static unsigned get_convert_stats_sha1(unsigned const char *sha1,
117+
unsigned search_only)
102118
{
119+
struct git_istream *st;
103120
struct text_stat stats;
104-
int ret = 0;
105-
if (!data || !size)
121+
enum object_type type;
122+
unsigned long sz;
123+
124+
if (!sha1)
125+
return 0;
126+
memset(&stats, 0, sizeof(stats));
127+
st = open_istream(sha1, &type, &sz, NULL);
128+
if (!st) {
106129
return 0;
107-
gather_stats(data, size, &stats);
108-
if (convert_is_binary(size, &stats))
109-
ret |= CONVERT_STAT_BITS_BIN;
110-
if (stats.crlf)
111-
ret |= CONVERT_STAT_BITS_TXT_CRLF;
112-
if (stats.lonelf)
113-
ret |= CONVERT_STAT_BITS_TXT_LF;
114-
115-
return ret;
130+
}
131+
if (type != OBJ_BLOB)
132+
goto close_and_exit_i;
133+
for (;;) {
134+
char buf[STREAM_BUFFER_SIZE];
135+
ssize_t readlen = read_istream(st, buf, sizeof(buf));
136+
if (readlen < 0)
137+
break;
138+
if (!readlen)
139+
break;
140+
gather_stats_partly(buf, (unsigned long)readlen, &stats, search_only);
141+
if (stats.stat_bits & search_only)
142+
break; /* We found what we have been searching for */
143+
}
144+
close_and_exit_i:
145+
close_istream(st);
146+
convert_nonprintable(&stats);
147+
return stats.stat_bits;
116148
}
117149

118-
static const char *gather_convert_stats_ascii(const char *data, unsigned long size)
150+
static const char *convert_stats_ascii(unsigned convert_stats)
119151
{
120-
unsigned int convert_stats = gather_convert_stats(data, size);
121-
152+
const unsigned eol_bits = CONVERT_STAT_BITS_TXT_LF |
153+
CONVERT_STAT_BITS_TXT_CRLF;
122154
if (convert_stats & CONVERT_STAT_BITS_BIN)
123155
return "-text";
124-
switch (convert_stats) {
156+
switch (convert_stats & eol_bits) {
125157
case CONVERT_STAT_BITS_TXT_LF:
126158
return "lf";
127159
case CONVERT_STAT_BITS_TXT_CRLF:
@@ -133,24 +165,45 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si
133165
}
134166
}
135167

168+
static unsigned get_convert_stats_wt(const char *path)
169+
{
170+
struct text_stat stats;
171+
unsigned search_only = CONVERT_STAT_BITS_BIN;
172+
int fd;
173+
memset(&stats, 0, sizeof(stats));
174+
fd = open(path, O_RDONLY);
175+
if (fd < 0)
176+
return 0;
177+
for (;;) {
178+
char buf[STREAM_BUFFER_SIZE];
179+
ssize_t readlen = read(fd, buf, sizeof(buf));
180+
if (readlen < 0)
181+
break;
182+
if (!readlen)
183+
break;
184+
gather_stats_partly(buf, (unsigned long)readlen, &stats, search_only);
185+
if (stats.stat_bits & search_only)
186+
break; /* We found what we have been searching for */
187+
}
188+
close(fd);
189+
convert_nonprintable(&stats);
190+
return stats.stat_bits;
191+
}
192+
136193
const char *get_cached_convert_stats_ascii(const char *path)
137194
{
138-
const char *ret;
139-
unsigned long sz;
140-
void *data = read_blob_data_from_cache(path, &sz);
141-
ret = gather_convert_stats_ascii(data, sz);
142-
free(data);
143-
return ret;
195+
unsigned convert_stats;
196+
unsigned search_only = CONVERT_STAT_BITS_BIN;
197+
convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
198+
search_only);
199+
return convert_stats_ascii(convert_stats);
144200
}
145201

146202
const char *get_wt_convert_stats_ascii(const char *path)
147203
{
148-
const char *ret = "";
149-
struct strbuf sb = STRBUF_INIT;
150-
if (strbuf_read_file(&sb, path, 0) >= 0)
151-
ret = gather_convert_stats_ascii(sb.buf, sb.len);
152-
strbuf_release(&sb);
153-
return ret;
204+
unsigned convert_stats;
205+
convert_stats = get_convert_stats_wt(path);
206+
return convert_stats_ascii(convert_stats);
154207
}
155208

156209
static int text_eol_is_crlf(void)
@@ -214,16 +267,10 @@ static void check_safe_crlf(const char *path, enum crlf_action crlf_action,
214267

215268
static int has_cr_in_index(const char *path)
216269
{
217-
unsigned long sz;
218-
void *data;
219-
int has_cr;
220-
221-
data = read_blob_data_from_cache(path, &sz);
222-
if (!data)
223-
return 0;
224-
has_cr = memchr(data, '\r', sz) != NULL;
225-
free(data);
226-
return has_cr;
270+
unsigned convert_stats;
271+
convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
272+
CONVERT_STAT_BITS_ANY_CR);
273+
return convert_stats & CONVERT_STAT_BITS_ANY_CR;
227274
}
228275

229276
static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
@@ -235,13 +282,13 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
235282
if (!stats->lonelf)
236283
return 0;
237284

238-
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
285+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_CRLF) {
239286
/* If we have any CR or CRLF line endings, we do not touch it */
240287
/* This is the new safer autocrlf-handling */
241288
if (stats->lonecr || stats->crlf)
242289
return 0;
243290

244-
if (convert_is_binary(len, stats))
291+
if (stats->stat_bits & CONVERT_STAT_BITS_BIN)
245292
return 0;
246293
}
247294
return 1;
@@ -254,7 +301,8 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
254301
{
255302
struct text_stat stats;
256303
char *dst;
257-
int convert_crlf_into_lf;
304+
int has_crlf_to_convert;
305+
unsigned search_only = 0;
258306

259307
if (crlf_action == CRLF_BINARY ||
260308
(src && !len))
@@ -267,12 +315,16 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
267315
if (!buf && !src)
268316
return 1;
269317

270-
gather_stats(src, len, &stats);
318+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF)
319+
search_only = CONVERT_STAT_BITS_BIN;
320+
321+
gather_all_stats(src, len, &stats, search_only);
322+
271323
/* Optimization: No CRLF? Nothing to convert, regardless. */
272-
convert_crlf_into_lf = !!stats.crlf;
324+
has_crlf_to_convert = !!stats.crlf;
273325

274326
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
275-
if (convert_is_binary(len, &stats))
327+
if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
276328
return 0;
277329
/*
278330
* If the file in the index has any CR in it, do not convert.
@@ -281,24 +333,35 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
281333
if (checksafe == SAFE_CRLF_RENORMALIZE)
282334
checksafe = SAFE_CRLF_FALSE;
283335
else if (has_cr_in_index(path))
284-
convert_crlf_into_lf = 0;
336+
has_crlf_to_convert = 0;
285337
}
286338
if (checksafe && len) {
287339
struct text_stat new_stats;
288340
memcpy(&new_stats, &stats, sizeof(new_stats));
289341
/* simulate "git add" */
290-
if (convert_crlf_into_lf) {
342+
if (has_crlf_to_convert) {
291343
new_stats.lonelf += new_stats.crlf;
292344
new_stats.crlf = 0;
345+
/* all crlf, if any, are gone. Update the bits */
346+
new_stats.stat_bits = stats.stat_bits & CONVERT_STAT_BITS_BIN;
347+
if (new_stats.lonelf)
348+
new_stats.stat_bits |= CONVERT_STAT_BITS_TXT_LF;
349+
if (new_stats.lonecr)
350+
new_stats.stat_bits |= CONVERT_STAT_BITS_ANY_CR;
293351
}
294352
/* simulate "git checkout" */
295353
if (will_convert_lf_to_crlf(len, &new_stats, crlf_action)) {
296354
new_stats.crlf += new_stats.lonelf;
297355
new_stats.lonelf = 0;
356+
new_stats.stat_bits = stats.stat_bits & CONVERT_STAT_BITS_BIN;
357+
if (new_stats.crlf)
358+
new_stats.stat_bits |= CONVERT_STAT_BITS_TXT_CRLF | CONVERT_STAT_BITS_ANY_CR;
359+
if (new_stats.lonecr)
360+
new_stats.stat_bits |= CONVERT_STAT_BITS_ANY_CR;
298361
}
299362
check_safe_crlf(path, crlf_action, &stats, &new_stats, checksafe);
300363
}
301-
if (!convert_crlf_into_lf)
364+
if (!has_crlf_to_convert)
302365
return 0;
303366

304367
/*
@@ -339,11 +402,15 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
339402
{
340403
char *to_free = NULL;
341404
struct text_stat stats;
405+
unsigned search_only = 0;
342406

343407
if (!len || output_eol(crlf_action) != EOL_CRLF)
344408
return 0;
345409

346-
gather_stats(src, len, &stats);
410+
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_CRLF)
411+
search_only = CONVERT_STAT_BITS_ANY_CR | CONVERT_STAT_BITS_BIN;
412+
413+
gather_all_stats(src, len, &stats, search_only);
347414
if (!will_convert_lf_to_crlf(len, &stats, crlf_action))
348415
return 0;
349416

0 commit comments

Comments
 (0)