Skip to content

Commit 6c510be

Browse files
torvalds and Junio C Hamano
authored and committed
Lazy man's auto-CRLF
It currently does NOT know about file attributes, so it does its
conversion purely based on content. Maybe that is more in the "git
philosophy" anyway, since content is king, but I think we should try to
do the file attributes to turn it off on demand.

Anyway, BY DEFAULT it is off regardless, because it requires a

	[core]
		AutoCRLF = true

in your config file to be enabled. We could make that the default for
Windows, of course, the same way we do some other things (filemode etc).

But you can actually enable it on UNIX, and it will cause:

 - "git update-index" will write blobs without CRLF
 - "git diff" will diff working tree files without CRLF
 - "git checkout" will write files to the working tree _with_ CRLF

and things work fine.

Funnily, it actually shows an odd file in git itself:

	git clone -n git test-crlf
	cd test-crlf
	git config core.autocrlf true
	git checkout
	git diff

shows a diff for "Documentation/docbook-xsl.css". Why? Because we have
actually checked in that file *with* CRLF! So when "core.autocrlf" is
true, we'll always generate a *different* hash for it in the index,
because the index hash will be for the content _without_ CRLF.

Is this complete? I dunno. It seems to work for me. It doesn't use the
filename at all right now, and that's probably a deficiency (we could
certainly make the "is_binary()" heuristics also take standard filename
heuristics into account).

I don't pass in the filename at all for the "index_fd()" case
(git-update-index), so that would need to be passed around, but this
actually works fine.

NOTE NOTE NOTE! The "is_binary()" heuristics are totally made-up by yours
truly. I will not guarantee that they work at all reasonably. Caveat
emptor. But it _is_ simple, and it _is_ safe, since it's all off by
default.

The patch is pretty simple - the biggest part is the new "convert.c" file,
but even that is really just basic stuff that anybody can write in
"Teaching C 101" as a final project for their first class in programming.
Not to say that it's bug-free, of course - but at least we're not talking
about rocket surgery here.

Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 437b1b2 commit 6c510be

File tree

8 files changed

+251
-5
lines changed

8 files changed

+251
-5
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,8 @@ LIB_OBJS = \
262262
revision.o pager.o tree-walk.o xdiff-interface.o \
263263
write_or_die.o trace.o list-objects.o grep.o \
264264
alloc.o merge-file.o path-list.o help.o unpack-trees.o $(DIFF_OBJS) \
265-
color.o wt-status.o archive-zip.o archive-tar.o shallow.o utf8.o
265+
color.o wt-status.o archive-zip.o archive-tar.o shallow.o utf8.o \
266+
convert.o
266267

267268
BUILTIN_OBJS = \
268269
builtin-add.o \

cache.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ extern const char *apply_default_whitespace;
201201
extern int zlib_compression_level;
202202
extern size_t packed_git_window_size;
203203
extern size_t packed_git_limit;
204+
extern int auto_crlf;
204205

205206
#define GIT_REPO_VERSION 0
206207
extern int repository_format_version;
@@ -468,4 +469,8 @@ extern int nfvasprintf(char **str, const char *fmt, va_list va);
468469
extern void trace_printf(const char *format, ...);
469470
extern void trace_argv_printf(const char **argv, int count, const char *format, ...);
470471

472+
/* convert.c */
473+
extern int convert_to_git(const char *path, char **bufp, unsigned long *sizep);
474+
extern int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep);
475+
471476
#endif /* CACHE_H */

config.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,11 @@ int git_default_config(const char *var, const char *value)
324324
return 0;
325325
}
326326

327+
if (!strcmp(var, "core.autocrlf")) {
328+
auto_crlf = git_config_bool(var, value);
329+
return 0;
330+
}
331+
327332
if (!strcmp(var, "user.name")) {
328333
strlcpy(git_default_name, value, sizeof(git_default_name));
329334
return 0;

convert.c

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#include "cache.h"
2+
/*
3+
* convert.c - convert a file when checking it out and checking it in.
4+
*
5+
* This should use the pathname to decide on whether it wants to do some
6+
* more interesting conversions (automatic gzip/unzip, general format
7+
* conversions etc etc), but by default it just does automatic CRLF<->LF
8+
* translation when the "auto_crlf" option is set.
9+
*/
10+
11+
/*
 * Per-buffer byte tallies used by the CRLF conversion heuristics.
 * Every field is a plain occurrence count.
 */
struct text_stat {
	unsigned cr;		/* '\r' bytes, whether or not an LF follows */
	unsigned lf;		/* '\n' bytes, whether or not a CR precedes */
	unsigned crlf;		/* '\r' bytes immediately followed by '\n' */

	/* The two counters below are only approximations. */
	unsigned printable;
	unsigned nonprintable;
};
18+
19+
static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
20+
{
21+
unsigned long i;
22+
23+
memset(stats, 0, sizeof(*stats));
24+
25+
for (i = 0; i < size; i++) {
26+
unsigned char c = buf[i];
27+
if (c == '\r') {
28+
stats->cr++;
29+
if (i+1 < size && buf[i+1] == '\n')
30+
stats->crlf++;
31+
continue;
32+
}
33+
if (c == '\n') {
34+
stats->lf++;
35+
continue;
36+
}
37+
if (c == 127)
38+
/* DEL */
39+
stats->nonprintable++;
40+
else if (c < 32) {
41+
switch (c) {
42+
/* BS, HT, ESC and FF */
43+
case '\b': case '\t': case '\033': case '\014':
44+
stats->printable++;
45+
break;
46+
default:
47+
stats->nonprintable++;
48+
}
49+
}
50+
else
51+
stats->printable++;
52+
}
53+
}
54+
55+
/*
56+
* The same heuristics as diff.c::mmfile_is_binary()
57+
*/
58+
static int is_binary(unsigned long size, struct text_stat *stats)
59+
{
60+
61+
if ((stats->printable >> 7) < stats->nonprintable)
62+
return 1;
63+
/*
64+
* Other heuristics? Average line length might be relevant,
65+
* as might LF vs CR vs CRLF counts..
66+
*
67+
* NOTE! It might be normal to have a low ratio of CRLF to LF
68+
* (somebody starts with a LF-only file and edits it with an editor
69+
* that adds CRLF only to lines that are added..). But do we
70+
* want to support CR-only? Probably not.
71+
*/
72+
return 0;
73+
}
74+
75+
int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
76+
{
77+
char *buffer, *nbuf;
78+
unsigned long size, nsize;
79+
struct text_stat stats;
80+
81+
/*
82+
* FIXME! Other pluggable conversions should go here,
83+
* based on filename patterns. Right now we just do the
84+
* stupid auto-CRLF one.
85+
*/
86+
if (!auto_crlf)
87+
return 0;
88+
89+
size = *sizep;
90+
if (!size)
91+
return 0;
92+
buffer = *bufp;
93+
94+
gather_stats(buffer, size, &stats);
95+
96+
/* No CR? Nothing to convert, regardless. */
97+
if (!stats.cr)
98+
return 0;
99+
100+
/*
101+
* We're currently not going to even try to convert stuff
102+
* that has bare CR characters. Does anybody do that crazy
103+
* stuff?
104+
*/
105+
if (stats.cr != stats.crlf)
106+
return 0;
107+
108+
/*
109+
* And add some heuristics for binary vs text, of course...
110+
*/
111+
if (is_binary(size, &stats))
112+
return 0;
113+
114+
/*
115+
* Ok, allocate a new buffer, fill it in, and return true
116+
* to let the caller know that we switched buffers on it.
117+
*/
118+
nsize = size - stats.crlf;
119+
nbuf = xmalloc(nsize);
120+
*bufp = nbuf;
121+
*sizep = nsize;
122+
do {
123+
unsigned char c = *buffer++;
124+
if (c != '\r')
125+
*nbuf++ = c;
126+
} while (--size);
127+
128+
return 1;
129+
}
130+
131+
int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
132+
{
133+
char *buffer, *nbuf;
134+
unsigned long size, nsize;
135+
struct text_stat stats;
136+
unsigned char last;
137+
138+
/*
139+
* FIXME! Other pluggable conversions should go here,
140+
* based on filename patterns. Right now we just do the
141+
* stupid auto-CRLF one.
142+
*/
143+
if (!auto_crlf)
144+
return 0;
145+
146+
size = *sizep;
147+
if (!size)
148+
return 0;
149+
buffer = *bufp;
150+
151+
gather_stats(buffer, size, &stats);
152+
153+
/* No LF? Nothing to convert, regardless. */
154+
if (!stats.lf)
155+
return 0;
156+
157+
/* Was it already in CRLF format? */
158+
if (stats.lf == stats.crlf)
159+
return 0;
160+
161+
/* If we have any bare CR characters, we're not going to touch it */
162+
if (stats.cr != stats.crlf)
163+
return 0;
164+
165+
if (is_binary(size, &stats))
166+
return 0;
167+
168+
/*
169+
* Ok, allocate a new buffer, fill it in, and return true
170+
* to let the caller know that we switched buffers on it.
171+
*/
172+
nsize = size + stats.lf - stats.crlf;
173+
nbuf = xmalloc(nsize);
174+
*bufp = nbuf;
175+
*sizep = nsize;
176+
last = 0;
177+
do {
178+
unsigned char c = *buffer++;
179+
if (c == '\n' && last != '\r')
180+
*nbuf++ = '\r';
181+
*nbuf++ = c;
182+
last = c;
183+
} while (--size);
184+
185+
return 1;
186+
}

diff.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1332,6 +1332,9 @@ int diff_populate_filespec(struct diff_filespec *s, int size_only)
13321332
reuse_worktree_file(s->path, s->sha1, 0)) {
13331333
struct stat st;
13341334
int fd;
1335+
char *buf;
1336+
unsigned long size;
1337+
13351338
if (lstat(s->path, &st) < 0) {
13361339
if (errno == ENOENT) {
13371340
err_empty:
@@ -1364,7 +1367,19 @@ int diff_populate_filespec(struct diff_filespec *s, int size_only)
13641367
s->data = xmmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
13651368
close(fd);
13661369
s->should_munmap = 1;
1367-
/* FIXME! CRLF -> LF conversion goes here, based on "s->path" */
1370+
1371+
/*
1372+
* Convert from working tree format to canonical git format
1373+
*/
1374+
buf = s->data;
1375+
size = s->size;
1376+
if (convert_to_git(s->path, &buf, &size)) {
1377+
munmap(s->data, s->size);
1378+
s->should_munmap = 0;
1379+
s->data = buf;
1380+
s->size = size;
1381+
s->should_free = 1;
1382+
}
13681383
}
13691384
else {
13701385
char type[20];

entry.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ static int write_entry(struct cache_entry *ce, char *path, struct checkout *stat
7878
path, sha1_to_hex(ce->sha1));
7979
}
8080
switch (ntohl(ce->ce_mode) & S_IFMT) {
81+
char *buf;
82+
unsigned long nsize;
83+
8184
case S_IFREG:
8285
if (to_tempfile) {
8386
strcpy(path, ".merge_file_XXXXXX");
@@ -89,7 +92,18 @@ static int write_entry(struct cache_entry *ce, char *path, struct checkout *stat
8992
return error("git-checkout-index: unable to create file %s (%s)",
9093
path, strerror(errno));
9194
}
92-
/* FIXME: LF -> CRLF conversion goes here, based on "ce->name" */
95+
96+
/*
97+
* Convert from git internal format to working tree format
98+
*/
99+
buf = new;
100+
nsize = size;
101+
if (convert_to_working_tree(ce->name, &buf, &nsize)) {
102+
free(new);
103+
new = buf;
104+
size = nsize;
105+
}
106+
93107
wrote = write_in_full(fd, new, size);
94108
close(fd);
95109
free(new);

environment.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
2828
size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
2929
int pager_in_use;
3030
int pager_use_color = 1;
31+
int auto_crlf = 0;
3132

3233
static const char *git_dir;
3334
static char *git_object_dir, *git_index_file, *git_refs_dir, *git_graft_file;

sha1_file.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,7 +2082,7 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object, con
20822082
{
20832083
unsigned long size = st->st_size;
20842084
void *buf;
2085-
int ret;
2085+
int ret, re_allocated = 0;
20862086

20872087
buf = "";
20882088
if (size)
@@ -2091,11 +2091,30 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object, con
20912091

20922092
if (!type)
20932093
type = blob_type;
2094-
/* FIXME: CRLF -> LF conversion here for blobs! We'll need the path! */
2094+
2095+
/*
2096+
* Convert blobs to git internal format
2097+
*/
2098+
if (!strcmp(type, blob_type)) {
2099+
unsigned long nsize = size;
2100+
char *nbuf = buf;
2101+
if (convert_to_git(NULL, &nbuf, &nsize)) {
2102+
if (size)
2103+
munmap(buf, size);
2104+
size = nsize;
2105+
buf = nbuf;
2106+
re_allocated = 1;
2107+
}
2108+
}
2109+
20952110
if (write_object)
20962111
ret = write_sha1_file(buf, size, type, sha1);
20972112
else
20982113
ret = hash_sha1_file(buf, size, type, sha1);
2114+
if (re_allocated) {
2115+
free(buf);
2116+
return ret;
2117+
}
20992118
if (size)
21002119
munmap(buf, size);
21012120
return ret;

0 commit comments

Comments
 (0)