aboutsummaryrefslogtreecommitdiff
path: root/compat/mingw.c
diff options
context:
space:
mode:
authorKarsten Blees <blees@dcon.de>2011-11-25 21:05:06 +0100
committerJunio C Hamano <gitster@pobox.com>2014-06-10 13:32:59 -0700
commit1c950a594c40db7a946616cbc6cc5f9e25926a20 (patch)
tree592879456ccabc56c1a76259a123bde51cfca85a /compat/mingw.c
parent1edeb9abf5828e317999b4ebe8b7472c494341f2 (diff)
downloadgit-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.gz
git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.xz
Win32: add Unicode conversion functions
Add Unicode conversion functions to convert between the Windows-native UTF-16LE encoding and UTF-8. To support repositories with legacy-encoded file names, the UTF-8 to UTF-16 conversion function tries to create valid, unique file names even for invalid UTF-8 byte sequences, so that these repositories can be checked out without error. The current implementation leaves invalid UTF-8 bytes in range 0xa0 - 0xff as is (producing printable Unicode chars \u00a0 - \u00ff, equivalent to ISO-8859-1), and converts 0x80 - 0x9f to hex-code (\u0080 - \u009f are control chars). The Windows MultiByteToWideChar API was not used as it either drops invalid UTF-8 sequences (on Win2k/XP; producing non-unique or even empty file names) or converts them to the replacement char \ufffd (Vista/7; causing ERROR_INVALID_NAME in subsequent calls to file system APIs). Signed-off-by: Karsten Blees <blees@dcon.de> Signed-off-by: Stepan Kasal <kasal@ucw.cz> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'compat/mingw.c')
-rw-r--r--compat/mingw.c85
1 files changed, 85 insertions, 0 deletions
diff --git a/compat/mingw.c b/compat/mingw.c
index c03bafa9c..6f1fb108e 100644
--- a/compat/mingw.c
+++ b/compat/mingw.c
@@ -1848,6 +1848,91 @@ int mingw_offset_1st_component(const char *path)
return offset + is_dir_sep(path[offset]);
}
/*
 * Convert a UTF-8 byte string into a NUL-terminated wide-character string.
 *
 *   wcs     output buffer
 *   utfs    input bytes
 *   wcslen  capacity of wcs in wchar_t units, including the terminating NUL
 *   utflen  number of input bytes to convert; a negative value means
 *           "convert up to (not including) the first NUL byte"
 *
 * Well-formed 1- to 3-byte sequences become a single wide char; well-formed
 * 4-byte sequences become a \ud8xx \udcxx surrogate pair.  Bytes that do not
 * start a well-formed sequence are not rejected, so that distinct inputs keep
 * producing distinct outputs:
 *   - bytes 0xa0..0xff are copied 1:1 (i.e. interpreted as ISO-8859-1),
 *   - bytes 0x80..0x9f are written as two lowercase hex digits.
 *
 * Returns the number of wide chars written (excluding the NUL), or -1 with
 * errno set to EINVAL (NULL argument or zero-sized buffer) or ERANGE (output
 * buffer too small; wcs is still NUL-terminated in that case).
 */
int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen)
{
	int upos = 0, wpos = 0;
	const unsigned char *utf = (const unsigned char *) utfs;

	if (!utf || !wcs || wcslen < 1) {
		errno = EINVAL;
		return -1;
	}
	/* reserve space for \0 */
	wcslen--;
	/* INT_MAX doubles as the "stop at NUL" marker checked in the loop */
	if (utflen < 0)
		utflen = INT_MAX;

	while (upos < utflen) {
		int c = utf[upos++] & 0xff;
		if (utflen == INT_MAX && c == 0)
			break;

		if ((size_t) wpos >= wcslen) {
			wcs[wpos] = 0;
			errno = ERANGE;
			return -1;
		}

		/*
		 * In the multi-byte tests below, each continuation byte is
		 * validated before the next one is read, so a NUL terminator
		 * stops the && chain before anything past it is accessed.
		 */
		if (c < 0x80) {
			/* ASCII */
			wcs[wpos++] = c;
		} else if (c >= 0xc2 && c < 0xe0 && upos < utflen &&
			   (utf[upos] & 0xc0) == 0x80) {
			/* 2-byte utf-8 (0xc0 / 0xc1 would be over-long) */
			c = ((c & 0x1f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xe0 && c < 0xf0 && upos + 1 < utflen &&
			   !(c == 0xe0 && utf[upos] < 0xa0) && /* over-long encoding */
			   (utf[upos] & 0xc0) == 0x80 &&
			   (utf[upos + 1] & 0xc0) == 0x80) {
			/* 3-byte utf-8 */
			c = ((c & 0x0f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xf0 && c < 0xf5 && upos + 2 < utflen &&
			   (size_t) wpos + 1 < wcslen &&
			   !(c == 0xf0 && utf[upos] < 0x90) && /* over-long encoding */
			   !(c == 0xf4 && utf[upos] >= 0x90) && /* > \u10ffff */
			   (utf[upos] & 0xc0) == 0x80 &&
			   (utf[upos + 1] & 0xc0) == 0x80 &&
			   (utf[upos + 2] & 0xc0) == 0x80) {
			/* 4-byte utf-8: convert to \ud8xx \udcxx surrogate pair */
			c = ((c & 0x07) << 18);
			c |= ((utf[upos++] & 0x3f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			c -= 0x10000;
			wcs[wpos++] = 0xd800 | (c >> 10);
			wcs[wpos++] = 0xdc00 | (c & 0x3ff);
		} else if (c >= 0xa0) {
			/* invalid utf-8 byte, printable unicode char: convert 1:1 */
			wcs[wpos++] = c;
		} else {
			/* invalid utf-8 byte, non-printable unicode: convert to hex */
			static const char *hex = "0123456789abcdef";

			/*
			 * Both hex digits must fit.  Emitting only the first
			 * one would return a truncated (and therefore
			 * ambiguous) result while still signalling success.
			 */
			if ((size_t) wpos + 1 >= wcslen) {
				wcs[wpos] = 0;
				errno = ERANGE;
				return -1;
			}
			wcs[wpos++] = hex[c >> 4];
			wcs[wpos++] = hex[c & 0x0f];
		}
	}
	wcs[wpos] = 0;
	return wpos;
}
+
+int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen)
+{
+ if (!wcs || !utf || utflen < 1) {
+ errno = EINVAL;
+ return -1;
+ }
+ utflen = WideCharToMultiByte(CP_UTF8, 0, wcs, -1, utf, utflen, NULL, NULL);
+ if (utflen)
+ return utflen - 1;
+ errno = ERANGE;
+ return -1;
+}
+
void mingw_startup()
{
/* copy executable name to argv[0] */