Skip to content

Commit aab2a1a

Browse files
tboegigitster
authored andcommitted
Support working-tree-encoding "UTF-16LE-BOM"
Users who want UTF-16 files in the working tree set the .gitattributes like this: test.txt working-tree-encoding=UTF-16 The unicode standard itself defines 3 allowed ways to encode UTF-16. The following 3 versions all convert back to 'g' 'i' 't' in UTF-8: a) UTF-16, without BOM, big endian: $ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c 0000000 g i t b) UTF-16, with BOM, little endian: $ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c 0000000 g i t c) UTF-16, with BOM, big endian: $ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c 0000000 g i t Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the working tree. After a checkout, the resulting file has a BOM and is encoded in "UTF-16", in the version (c) above. This is what iconv generates; more details follow below. iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE: d) UTF-16 $ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c 0000000 376 377 \0 g \0 i \0 t e) UTF-16LE $ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c 0000000 g \0 i \0 t \0 f) UTF-16BE $ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c 0000000 \0 g \0 i \0 t There is no way to generate version (b) from above in a Git working tree, but that is what some applications need. (All fully unicode aware applications should be able to read all 3 variants, but in practice we are not there yet). When producing UTF-16 as an output, iconv generates the big endian version with a BOM. (big endian is probably chosen for historical reasons). iconv can produce UTF-16 files with little endianness by using "UTF-16LE" as encoding, and that file does not have a BOM. Not all users (especially under Windows) are happy with this. Some tools are not fully unicode aware and can only handle version (b). Today there is no way to produce version (b) with iconv (or libiconv). 
Looking into the history of iconv, it seems as if version (c) will be used in all future iconv versions (for compatibility reasons). Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM". libiconv cannot handle the encoding, so Git picks it up, handles the BOM and uses libiconv to convert the rest of the stream. (UTF-16BE-BOM is added for consistency) Reported-by: Adrián Gimeno Balaguer <[email protected]> Signed-off-by: Torsten Bögershausen <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 0d0ac38 commit aab2a1a

File tree

5 files changed

+48
-14
lines changed

5 files changed

+48
-14
lines changed

Documentation/gitattributes.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,9 @@ automatic line ending conversion based on your platform.
344344

345345
Use the following attributes if your '*.ps1' files are UTF-16 little
346346
endian encoded without BOM and you want Git to use Windows line endings
347-
in the working directory. Please note, it is highly recommended to
347+
in the working directory (use `UTF-16LE-BOM` instead of `UTF-16LE` if
348+
you want UTF-16 little endian with BOM).
349+
Please note, it is highly recommended to
348350
explicitly define the line endings with `eol` if the `working-tree-encoding`
349351
attribute is used to avoid ambiguity.
350352

compat/precompose_utf8.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ void precompose_argv(int argc, const char **argv)
7979
size_t namelen;
8080
oldarg = argv[i];
8181
if (has_non_ascii(oldarg, (size_t)-1, &namelen)) {
82-
newarg = reencode_string_iconv(oldarg, namelen, ic_precompose, NULL);
82+
newarg = reencode_string_iconv(oldarg, namelen, ic_precompose, 0, NULL);
8383
if (newarg)
8484
argv[i] = newarg;
8585
}

t/t0028-working-tree-encoding.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ test_expect_success 'setup test files' '
1111
1212
text="hallo there!\ncan you read me?" &&
1313
echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
14+
echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
1415
printf "$text" >test.utf8.raw &&
1516
printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
1617
printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
18+
printf "\377\376" >test.utf16lebom.raw &&
19+
printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
1720
1821
# Line ending tests
1922
printf "one\ntwo\nthree\n" >lf.utf8.raw &&
@@ -32,7 +35,8 @@ test_expect_success 'setup test files' '
3235
# Add only UTF-16 file, we will add the UTF-32 file later
3336
cp test.utf16.raw test.utf16 &&
3437
cp test.utf32.raw test.utf32 &&
35-
git add .gitattributes test.utf16 &&
38+
cp test.utf16lebom.raw test.utf16lebom &&
39+
git add .gitattributes test.utf16 test.utf16lebom &&
3640
git commit -m initial
3741
'
3842

@@ -51,6 +55,12 @@ test_expect_success 're-encode to UTF-16 on checkout' '
5155
test_cmp_bin test.utf16.raw test.utf16
5256
'
5357

58+
test_expect_success 're-encode to UTF-16-LE-BOM on checkout' '
59+
rm test.utf16lebom &&
60+
git checkout test.utf16lebom &&
61+
test_cmp_bin test.utf16lebom.raw test.utf16lebom
62+
'
63+
5464
test_expect_success 'check $GIT_DIR/info/attributes support' '
5565
test_when_finished "rm -f test.utf32.git" &&
5666
test_when_finished "git reset --hard HEAD" &&

utf8.c

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */
66

7+
static const char utf16_be_bom[] = {'\xFE', '\xFF'};
8+
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
9+
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
10+
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};
11+
712
struct interval {
813
ucs_char_t first;
914
ucs_char_t last;
@@ -470,16 +475,17 @@ int utf8_fprintf(FILE *stream, const char *format, ...)
470475
#else
471476
typedef char * iconv_ibp;
472477
#endif
473-
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, size_t *outsz_p)
478+
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv,
479+
size_t bom_len, size_t *outsz_p)
474480
{
475481
size_t outsz, outalloc;
476482
char *out, *outpos;
477483
iconv_ibp cp;
478484

479485
outsz = insz;
480-
outalloc = st_add(outsz, 1); /* for terminating NUL */
486+
outalloc = st_add(outsz, 1 + bom_len); /* for terminating NUL */
481487
out = xmalloc(outalloc);
482-
outpos = out;
488+
outpos = out + bom_len;
483489
cp = (iconv_ibp)in;
484490

485491
while (1) {
@@ -540,10 +546,30 @@ char *reencode_string_len(const char *in, size_t insz,
540546
{
541547
iconv_t conv;
542548
char *out;
549+
const char *bom_str = NULL;
550+
size_t bom_len = 0;
543551

544552
if (!in_encoding)
545553
return NULL;
546554

555+
/* UTF-16LE-BOM is the same as UTF-16 for reading */
556+
if (same_utf_encoding("UTF-16LE-BOM", in_encoding))
557+
in_encoding = "UTF-16";
558+
559+
/*
560+
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
561+
* Some users under Windows want the little endian version
562+
*/
563+
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
564+
bom_str = utf16_le_bom;
565+
bom_len = sizeof(utf16_le_bom);
566+
out_encoding = "UTF-16LE";
567+
} else if (same_utf_encoding("UTF-16BE-BOM", out_encoding)) {
568+
bom_str = utf16_be_bom;
569+
bom_len = sizeof(utf16_be_bom);
570+
out_encoding = "UTF-16BE";
571+
}
572+
547573
conv = iconv_open(out_encoding, in_encoding);
548574
if (conv == (iconv_t) -1) {
549575
in_encoding = fallback_encoding(in_encoding);
@@ -553,9 +579,10 @@ char *reencode_string_len(const char *in, size_t insz,
553579
if (conv == (iconv_t) -1)
554580
return NULL;
555581
}
556-
557-
out = reencode_string_iconv(in, insz, conv, outsz);
582+
out = reencode_string_iconv(in, insz, conv, bom_len, outsz);
558583
iconv_close(conv);
584+
if (out && bom_str && bom_len)
585+
memcpy(out, bom_str, bom_len);
559586
return out;
560587
}
561588
#endif
@@ -566,11 +593,6 @@ static int has_bom_prefix(const char *data, size_t len,
566593
return data && bom && (len >= bom_len) && !memcmp(data, bom, bom_len);
567594
}
568595

569-
static const char utf16_be_bom[] = {'\xFE', '\xFF'};
570-
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
571-
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
572-
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};
573-
574596
int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
575597
{
576598
return (

utf8.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ void strbuf_utf8_replace(struct strbuf *sb, int pos, int width,
2727

2828
#ifndef NO_ICONV
2929
char *reencode_string_iconv(const char *in, size_t insz,
30-
iconv_t conv, size_t *outsz);
30+
iconv_t conv, size_t bom_len, size_t *outsz);
3131
char *reencode_string_len(const char *in, size_t insz,
3232
const char *out_encoding,
3333
const char *in_encoding,

0 commit comments

Comments
 (0)