Skip to content

Commit 1c950a5

Browse files
kbleesgitster
authored andcommitted
Win32: add Unicode conversion functions
Add Unicode conversion functions to convert between Windows native UTF-16LE encoding and UTF-8. To support repositories with legacy-encoded file names, the UTF-8 to UTF-16 conversion function tries to create valid, unique file names even for invalid UTF-8 byte sequences, so that these repositories can be checked out without error. The current implementation leaves invalid UTF-8 bytes in range 0xa0 - 0xff as is (producing printable Unicode chars \u00a0 - \u00ff, equivalent to ISO-8859-1), and converts 0x80 - 0x9f to hex-code (\u0080 - \u009f are control chars). The Windows MultiByteToWideChar API was not used as it either drops invalid UTF-8 sequences (on Win2k/XP; producing non-unique or even empty file names) or converts them to the replacement char \ufffd (Vista/7; causing ERROR_INVALID_NAME in subsequent calls to file system APIs). Signed-off-by: Karsten Blees <[email protected]> Signed-off-by: Stepan Kasal <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 1edeb9a commit 1c950a5

File tree

2 files changed

+189
-0
lines changed

2 files changed

+189
-0
lines changed

compat/mingw.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1848,6 +1848,91 @@ int mingw_offset_1st_component(const char *path)
18481848
return offset + is_dir_sep(path[offset]);
18491849
}
18501850

1851+
/*
 * Converts the UTF-8 string utfs to wide chars in wcs.
 *
 * Well-formed 1- to 3-byte sequences become one wide char each, 4-byte
 * sequences become a surrogate pair. Bytes that do not start a sequence
 * accepted below are treated as invalid: 0xa0 - 0xff are copied 1:1 and
 * 0x80 - 0x9f are written as two lower-case hex digits.
 *
 * wcs:    target buffer, always \0-terminated once the parameters are valid
 * utfs:   string to convert
 * wcslen: size of the target buffer in wchar_t's, including the \0
 * utflen: number of bytes to convert, or negative if utfs is \0-terminated
 *
 * Returns the number of wide chars written (excluding the \0), or -1 with
 * errno set to EINVAL (NULL pointer or empty buffer) or ERANGE (the output
 * does not fit into wcs).
 */
int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen)
{
	int upos = 0, wpos = 0;
	const unsigned char *utf = (const unsigned char *) utfs;

	if (!utf || !wcs || wcslen < 1) {
		errno = EINVAL;
		return -1;
	}
	/* reserve space for \0 */
	wcslen--;
	/* INT_MAX doubles as the marker for "stop at the first \0 byte" */
	if (utflen < 0)
		utflen = INT_MAX;

	while (upos < utflen) {
		int c = utf[upos++] & 0xff;
		if (utflen == INT_MAX && c == 0)
			break;

		if (wpos >= wcslen) {
			wcs[wpos] = 0;
			errno = ERANGE;
			return -1;
		}

		if (c < 0x80) {
			/* ASCII */
			wcs[wpos++] = c;
		} else if (c >= 0xc2 && c < 0xe0 && upos < utflen &&
				(utf[upos] & 0xc0) == 0x80) {
			/* 2-byte utf-8 */
			c = ((c & 0x1f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xe0 && c < 0xf0 && upos + 1 < utflen &&
				!(c == 0xe0 && utf[upos] < 0xa0) && /* over-long encoding */
				(utf[upos] & 0xc0) == 0x80 &&
				(utf[upos + 1] & 0xc0) == 0x80) {
			/* 3-byte utf-8 */
			c = ((c & 0x0f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xf0 && c < 0xf5 && upos + 2 < utflen &&
				wpos + 1 < wcslen &&
				!(c == 0xf0 && utf[upos] < 0x90) && /* over-long encoding */
				!(c == 0xf4 && utf[upos] >= 0x90) && /* > \u10ffff */
				(utf[upos] & 0xc0) == 0x80 &&
				(utf[upos + 1] & 0xc0) == 0x80 &&
				(utf[upos + 2] & 0xc0) == 0x80) {
			/* 4-byte utf-8: convert to \ud8xx \udcxx surrogate pair */
			c = ((c & 0x07) << 18);
			c |= ((utf[upos++] & 0x3f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			c -= 0x10000;
			wcs[wpos++] = 0xd800 | (c >> 10);
			wcs[wpos++] = 0xdc00 | (c & 0x3ff);
		} else if (c >= 0xa0) {
			/* invalid utf-8 byte, printable unicode char: convert 1:1 */
			wcs[wpos++] = c;
		} else {
			/* invalid utf-8 byte, non-printable unicode: convert to hex */
			static const char hex[] = "0123456789abcdef";

			/*
			 * Both digits must fit. Emitting only the high nibble
			 * would map sixteen different bytes to the same output
			 * and report a truncated string as success.
			 */
			if ((size_t) wpos + 1 >= wcslen) {
				wcs[wpos] = 0;
				errno = ERANGE;
				return -1;
			}
			wcs[wpos++] = hex[c >> 4];
			wcs[wpos++] = hex[c & 0x0f];
		}
	}
	wcs[wpos] = 0;
	return wpos;
}
1922+
1923+
int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen)
1924+
{
1925+
if (!wcs || !utf || utflen < 1) {
1926+
errno = EINVAL;
1927+
return -1;
1928+
}
1929+
utflen = WideCharToMultiByte(CP_UTF8, 0, wcs, -1, utf, utflen, NULL, NULL);
1930+
if (utflen)
1931+
return utflen - 1;
1932+
errno = ERANGE;
1933+
return -1;
1934+
}
1935+
18511936
void mingw_startup()
18521937
{
18531938
/* copy executable name to argv[0] */

compat/mingw.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,110 @@ void mingw_open_html(const char *path);
357357
char **make_augmented_environ(const char *const *vars);
358358
void free_environ(char **env);
359359

360+
/**
361+
* Converts UTF-8 encoded string to UTF-16LE.
362+
*
363+
* To support repositories with legacy-encoded file names, invalid UTF-8 bytes
364+
* 0xa0 - 0xff are converted to corresponding printable Unicode chars \u00a0 -
365+
* \u00ff, and invalid UTF-8 bytes 0x80 - 0x9f (which would make non-printable
366+
* Unicode) are converted to hex-code.
367+
*
368+
* Lead-bytes not followed by an appropriate number of trail-bytes, over-long
369+
* encodings and 4-byte encodings > \u10ffff are detected as invalid UTF-8.
370+
*
371+
* Maximum space requirement for the target buffer is two wide chars per UTF-8
372+
* char (((strlen(utf) * 2) + 1) [* sizeof(wchar_t)]).
373+
*
374+
* The maximum space is needed only if the entire input string consists of
375+
* invalid UTF-8 bytes in range 0x80-0x9f, as per the following table:
376+
*
377+
* | | UTF-8 | UTF-16 |
378+
* Code point | UTF-8 sequence | bytes | words | ratio
379+
* --------------+-------------------+-------+--------+-------
380+
* 000000-00007f | 0-7f | 1 | 1 | 1
381+
* 000080-0007ff | c2-df + 80-bf | 2 | 1 | 0.5
382+
* 000800-00ffff | e0-ef + 2 * 80-bf | 3 | 1 | 0.33
383+
* 010000-10ffff | f0-f4 + 3 * 80-bf | 4 | 2 (a) | 0.5
384+
* invalid | 80-9f | 1 | 2 (b) | 2
385+
* invalid | a0-ff | 1 | 1 | 1
386+
*
387+
* (a) encoded as UTF-16 surrogate pair
388+
* (b) encoded as two hex digits
389+
*
390+
* Note that, while the UTF-8 encoding scheme can be extended to 5-byte, 6-byte
391+
* or even indefinite-byte sequences, the largest valid code point \u10ffff
392+
* encodes as only 4 UTF-8 bytes.
393+
*
394+
* Parameters:
395+
* wcs: wide char target buffer
396+
* utf: string to convert
397+
* wcslen: size of target buffer (in wchar_t's)
398+
* utflen: size of string to convert, or -1 if 0-terminated
399+
*
400+
* Returns:
401+
* length of converted string (_wcslen(wcs)), or -1 on failure
402+
*
403+
* Errors:
404+
* EINVAL: one of the input parameters is invalid (e.g. NULL)
405+
* ERANGE: the output buffer is too small
406+
*/
407+
int xutftowcsn(wchar_t *wcs, const char *utf, size_t wcslen, int utflen);
408+
409+
/**
 * Convenience wrapper around xutftowcsn for \0-terminated input: converts
 * utf into the wcslen wide chars at wcs and returns whatever xutftowcsn
 * returns.
 */
static inline int xutftowcs(wchar_t *wcs, const char *utf, size_t wcslen)
{
	/* -1 is the documented "input is 0-terminated" length */
	const int nul_terminated = -1;

	return xutftowcsn(wcs, utf, wcslen, nul_terminated);
}
416+
417+
/**
418+
* Simplified file system specific variant of xutftowcsn, assumes output
419+
* buffer size is MAX_PATH wide chars and input string is \0-terminated,
420+
* fails with ENAMETOOLONG if input string is too long.
421+
*/
422+
static inline int xutftowcs_path(wchar_t *wcs, const char *utf)
423+
{
424+
int result = xutftowcsn(wcs, utf, MAX_PATH, -1);
425+
if (result < 0 && errno == ERANGE)
426+
errno = ENAMETOOLONG;
427+
return result;
428+
}
429+
430+
/**
431+
* Converts UTF-16LE encoded string to UTF-8.
432+
*
433+
* Maximum space requirement for the target buffer is three UTF-8 chars per
434+
* wide char ((_wcslen(wcs) * 3) + 1).
435+
*
436+
* The maximum space is needed only if the entire input string consists of
437+
* UTF-16 words in range 0x0800-0xd7ff or 0xe000-0xffff (i.e. \u0800-\uffff
438+
* modulo surrogate pairs), as per the following table:
439+
*
440+
* | | UTF-16 | UTF-8 |
441+
* Code point | UTF-16 sequence | words | bytes | ratio
442+
* --------------+-----------------------+--------+-------+-------
443+
* 000000-00007f | 0000-007f | 1 | 1 | 1
444+
* 000080-0007ff | 0080-07ff | 1 | 2 | 2
445+
* 000800-00ffff | 0800-d7ff / e000-ffff | 1 | 3 | 3
446+
* 010000-10ffff | d800-dbff + dc00-dfff | 2 | 4 | 2
447+
*
448+
* Note that invalid code points > 10ffff cannot be represented in UTF-16.
449+
*
450+
* Parameters:
451+
* utf: target buffer
452+
* wcs: wide string to convert
453+
* utflen: size of target buffer
454+
*
455+
* Returns:
456+
* length of converted string, or -1 on failure
457+
*
458+
* Errors:
459+
* EINVAL: one of the input parameters is invalid (e.g. NULL)
460+
* ERANGE: the output buffer is too small
461+
*/
462+
int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen);
463+
360464
/*
361465
* A critical section used in the implementation of the spawn
362466
* functions (mingw_spawnv[p]e()) and waitpid(). Initialised in

0 commit comments

Comments
 (0)