Win32: add Unicode conversion functions

Add Unicode conversion functions to convert between Windows native UTF-16LE encoding to UTF-8 and back. To support repositories with legacy-encoded file names, the UTF-8 to UTF-16 conversion function tries to create valid, unique file names even for invalid UTF-8 byte sequences, so that these repositories can be checked out without error. The current implementation leaves invalid UTF-8 bytes in range 0xa0 - 0xff as is (producing printable Unicode chars \u00a0 - \u00ff, equivalent to ISO-8859-1), and converts 0x80 - 0x9f to hex-code (\u0080 - \u009f are control chars). The Windows MultiByteToWideChar API was not used as it either drops invalid UTF-8 sequences (on Win2k/XP; producing non-unique or even empty file names) or converts them to the replacement char \ufffd (Vista/7; causing ERROR_INVALID_NAME in subsequent calls to file system APIs). Signed-off-by: Karsten Blees <blees@dcon.de> Signed-off-by: Stepan Kasal <kasal@ucw.cz> Signed-off-by: Junio C Hamano <gitster@pobox.com>
author: Karsten Blees <blees@dcon.de> 2011-11-25 21:05:06 +0100
committer: Junio C Hamano <gitster@pobox.com> 2014-06-10 13:32:59 -0700
commit: 1c950a594c40db7a946616cbc6cc5f9e25926a20 (patch)
tree: 592879456ccabc56c1a76259a123bde51cfca85a /compat/mingw.c
parent: 1edeb9abf5828e317999b4ebe8b7472c494341f2 (diff)
download: git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.gz
git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.xz
1 files changed, 85 insertions, 0 deletions
diff --git a/compat/mingw.c b/compat/mingw.c
index c03bafa9c..6f1fb108e 100644
--- a/compat/mingw.c
+++ b/compat/mingw.c
@@ -1848,6 +1848,91 @@ int mingw_offset_1st_component(const char *path)
 	return offset + is_dir_sep(path[offset]);
 }
 
+int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen)
+{
+	int upos = 0, wpos = 0;
+	const unsigned char *utf = (const unsigned char*) utfs;
+	if (!utf || !wcs || wcslen < 1) {
+		errno = EINVAL;
+		return -1;
+	}
+	/* reserve space for \0 */
+	wcslen--;
+	if (utflen < 0)
+		utflen = INT_MAX;
+
+	while (upos < utflen) {
+		int c = utf[upos++] & 0xff;
+		if (utflen == INT_MAX && c == 0)
+			break;
+
+		if (wpos >= wcslen) {
+			wcs[wpos] = 0;
+			errno = ERANGE;
+			return -1;
+		}
+
+		if (c < 0x80) {
+			/* ASCII */
+			wcs[wpos++] = c;
+		} else if (c >= 0xc2 && c < 0xe0 && upos < utflen &&
+				(utf[upos] & 0xc0) == 0x80) {
+			/* 2-byte utf-8 */
+			c = ((c & 0x1f) << 6);
+			c |= (utf[upos++] & 0x3f);
+			wcs[wpos++] = c;
+		} else if (c >= 0xe0 && c < 0xf0 && upos + 1 < utflen &&
+				!(c == 0xe0 && utf[upos] < 0xa0) && /* over-long encoding */
+				(utf[upos] & 0xc0) == 0x80 &&
+				(utf[upos + 1] & 0xc0) == 0x80) {
+			/* 3-byte utf-8 */
+			c = ((c & 0x0f) << 12);
+			c |= ((utf[upos++] & 0x3f) << 6);
+			c |= (utf[upos++] & 0x3f);
+			wcs[wpos++] = c;
+		} else if (c >= 0xf0 && c < 0xf5 && upos + 2 < utflen &&
+				wpos + 1 < wcslen &&
+				!(c == 0xf0 && utf[upos] < 0x90) && /* over-long encoding */
+				!(c == 0xf4 && utf[upos] >= 0x90) && /* > \u10ffff */
+				(utf[upos] & 0xc0) == 0x80 &&
+				(utf[upos + 1] & 0xc0) == 0x80 &&
+				(utf[upos + 2] & 0xc0) == 0x80) {
+			/* 4-byte utf-8: convert to \ud8xx \udcxx surrogate pair */
+			c = ((c & 0x07) << 18);
+			c |= ((utf[upos++] & 0x3f) << 12);
+			c |= ((utf[upos++] & 0x3f) << 6);
+			c |= (utf[upos++] & 0x3f);
+			c -= 0x10000;
+			wcs[wpos++] = 0xd800 | (c >> 10);
+			wcs[wpos++] = 0xdc00 | (c & 0x3ff);
+		} else if (c >= 0xa0) {
+			/* invalid utf-8 byte, printable unicode char: convert 1:1 */
+			wcs[wpos++] = c;
+		} else {
+			/* invalid utf-8 byte, non-printable unicode: convert to hex */
+			static const char *hex = "0123456789abcdef";
+			wcs[wpos++] = hex[c >> 4];
+			if (wpos < wcslen)
+				wcs[wpos++] = hex[c & 0x0f];
+		}
+	}
+	wcs[wpos] = 0;
+	return wpos;
+}
+
+int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen)
+{
+	if (!wcs || !utf || utflen < 1) {
+		errno = EINVAL;
+		return -1;
+	}
+	utflen = WideCharToMultiByte(CP_UTF8, 0, wcs, -1, utf, utflen, NULL, NULL);
+	if (utflen)
+		return utflen - 1;
+	errno = ERANGE;
+	return -1;
+}
+
 void mingw_startup()
 {
 	/* copy executable name to argv[0] */
author	Karsten Blees <blees@dcon.de>	2011-11-25 21:05:06 +0100
committer	Junio C Hamano <gitster@pobox.com>	2014-06-10 13:32:59 -0700
commit	1c950a594c40db7a946616cbc6cc5f9e25926a20 (patch)
tree	592879456ccabc56c1a76259a123bde51cfca85a /compat/mingw.c
parent	1edeb9abf5828e317999b4ebe8b7472c494341f2 (diff)
download	git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.gz git-1c950a594c40db7a946616cbc6cc5f9e25926a20.tar.xz