diff options
author | Karsten Blees <blees@dcon.de> | 2011-11-25 21:05:06 +0100 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2014-06-10 13:32:59 -0700 |
commit | 1c950a594c40db7a946616cbc6cc5f9e25926a20 (patch) | |
tree | 592879456ccabc56c1a76259a123bde51cfca85a /compat/mingw.c | |
parent | 1edeb9abf5828e317999b4ebe8b7472c494341f2 (diff) | |
download | git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.gz git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.xz |
Win32: add Unicode conversion functions
Add Unicode conversion functions to convert between Windows native UTF-16LE
encoding to UTF-8 and back.
To support repositories with legacy-encoded file names, the UTF-8 to UTF-16
conversion function tries to create valid, unique file names even for
invalid UTF-8 byte sequences, so that these repositories can be checked out
without error.
The current implementation leaves invalid UTF-8 bytes in range 0xa0 - 0xff
as is (producing printable Unicode chars \u00a0 - \u00ff, equivalent to
ISO-8859-1), and converts 0x80 - 0x9f to hex-code (\u0080 - \u009f are
control chars).
The Windows MultiByteToWideChar API was not used as it either drops invalid
UTF-8 sequences (on Win2k/XP; producing non-unique or even empty file
names) or converts them to the replacement char \ufffd (Vista/7; causing
ERROR_INVALID_NAME in subsequent calls to file system APIs).
Signed-off-by: Karsten Blees <blees@dcon.de>
Signed-off-by: Stepan Kasal <kasal@ucw.cz>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'compat/mingw.c')
-rw-r--r-- | compat/mingw.c | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/compat/mingw.c b/compat/mingw.c index c03bafa9c..6f1fb108e 100644 --- a/compat/mingw.c +++ b/compat/mingw.c @@ -1848,6 +1848,91 @@ int mingw_offset_1st_component(const char *path) return offset + is_dir_sep(path[offset]); } +int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen) +{ + int upos = 0, wpos = 0; + const unsigned char *utf = (const unsigned char*) utfs; + if (!utf || !wcs || wcslen < 1) { + errno = EINVAL; + return -1; + } + /* reserve space for \0 */ + wcslen--; + if (utflen < 0) + utflen = INT_MAX; + + while (upos < utflen) { + int c = utf[upos++] & 0xff; + if (utflen == INT_MAX && c == 0) + break; + + if (wpos >= wcslen) { + wcs[wpos] = 0; + errno = ERANGE; + return -1; + } + + if (c < 0x80) { + /* ASCII */ + wcs[wpos++] = c; + } else if (c >= 0xc2 && c < 0xe0 && upos < utflen && + (utf[upos] & 0xc0) == 0x80) { + /* 2-byte utf-8 */ + c = ((c & 0x1f) << 6); + c |= (utf[upos++] & 0x3f); + wcs[wpos++] = c; + } else if (c >= 0xe0 && c < 0xf0 && upos + 1 < utflen && + !(c == 0xe0 && utf[upos] < 0xa0) && /* over-long encoding */ + (utf[upos] & 0xc0) == 0x80 && + (utf[upos + 1] & 0xc0) == 0x80) { + /* 3-byte utf-8 */ + c = ((c & 0x0f) << 12); + c |= ((utf[upos++] & 0x3f) << 6); + c |= (utf[upos++] & 0x3f); + wcs[wpos++] = c; + } else if (c >= 0xf0 && c < 0xf5 && upos + 2 < utflen && + wpos + 1 < wcslen && + !(c == 0xf0 && utf[upos] < 0x90) && /* over-long encoding */ + !(c == 0xf4 && utf[upos] >= 0x90) && /* > \u10ffff */ + (utf[upos] & 0xc0) == 0x80 && + (utf[upos + 1] & 0xc0) == 0x80 && + (utf[upos + 2] & 0xc0) == 0x80) { + /* 4-byte utf-8: convert to \ud8xx \udcxx surrogate pair */ + c = ((c & 0x07) << 18); + c |= ((utf[upos++] & 0x3f) << 12); + c |= ((utf[upos++] & 0x3f) << 6); + c |= (utf[upos++] & 0x3f); + c -= 0x10000; + wcs[wpos++] = 0xd800 | (c >> 10); + wcs[wpos++] = 0xdc00 | (c & 0x3ff); + } else if (c >= 0xa0) { + /* invalid utf-8 byte, printable unicode char: convert 1:1 */ + wcs[wpos++] = c; + } else { + /* invalid utf-8 byte, non-printable unicode: convert to hex */ + static const char *hex = "0123456789abcdef"; + wcs[wpos++] = hex[c >> 4]; + if (wpos < wcslen) + wcs[wpos++] = hex[c & 0x0f]; + } + } + wcs[wpos] = 0; + return wpos; +} + +int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen) +{ + if (!wcs || !utf || utflen < 1) { + errno = EINVAL; + return -1; + } + utflen = WideCharToMultiByte(CP_UTF8, 0, wcs, -1, utf, utflen, NULL, NULL); + if (utflen) + return utflen - 1; + errno = ERANGE; + return -1; +} + void mingw_startup() { /* copy executable name to argv[0] */ |