aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2012-07-13 15:37:51 -0700
committerJunio C Hamano <gitster@pobox.com>2012-07-13 15:37:51 -0700
commitb856ad623e4f686815986c0b9341dd1bfd791e71 (patch)
treeddce49b4afdb7428c1cba49628020bd796704661
parent6a9aa0c9b224517db0549d9252fdbc5177e6c0e2 (diff)
parent76759c7dff53e8c84e975b88cb8245587c14c7ba (diff)
downloadgit-b856ad623e4f686815986c0b9341dd1bfd791e71.tar.gz
git-b856ad623e4f686815986c0b9341dd1bfd791e71.tar.xz
Merge branch 'tb/sanitize-decomposed-utf-8-pathname'
Teaches git to normalize pathnames read from readdir(3) and all arguments from the command line into precomposed UTF-8 (assuming that they come as decomposed UTF-8) to work around issues on Mac OS. I think there still are other places that need conversion (e.g. paths that are read from stdin for some commands), but this should be a good first step in the right direction. * tb/sanitize-decomposed-utf-8-pathname: git on Mac OS and precomposed unicode
-rw-r--r--Documentation/config.txt9
-rw-r--r--Makefile3
-rw-r--r--builtin/init-db.c1
-rw-r--r--cache.h1
-rw-r--r--compat/precompose_utf8.c190
-rw-r--r--compat/precompose_utf8.h45
-rw-r--r--config.c5
-rw-r--r--environment.c1
-rw-r--r--git-compat-util.h9
-rw-r--r--parse-options.c1
-rwxr-xr-xt/t3910-mac-os-precompose.sh164
-rw-r--r--utf8.c26
-rw-r--r--utf8.h1
13 files changed, 446 insertions, 10 deletions
diff --git a/Documentation/config.txt b/Documentation/config.txt
index c6ff15e59..7bc0e5384 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -211,6 +211,15 @@ The default is false, except linkgit:git-clone[1] or linkgit:git-init[1]
will probe and set core.ignorecase true if appropriate when the repository
is created.
+core.precomposeunicode::
+ This option is only used by Mac OS implementation of git.
+ When core.precomposeunicode=true, git reverts the unicode decomposition
+ of filenames done by Mac OS. This is useful when sharing a repository
+ between Mac OS and Linux or Windows.
+ (Git for Windows 1.7.10 or higher is needed, or git under cygwin 1.7).
+ When false, file names are handled fully transparent by git,
+ which is backward compatible with older versions of git.
+
core.trustctime::
If false, the ctime differences between the index and the
working tree are ignored; useful when the inode change time
diff --git a/Makefile b/Makefile
index 169dda545..dc7090296 100644
--- a/Makefile
+++ b/Makefile
@@ -607,6 +607,7 @@ LIB_H += compat/bswap.h
LIB_H += compat/cygwin.h
LIB_H += compat/mingw.h
LIB_H += compat/obstack.h
+LIB_H += compat/precompose_utf8.h
LIB_H += compat/terminal.h
LIB_H += compat/win32/dirent.h
LIB_H += compat/win32/poll.h
@@ -1001,6 +1002,8 @@ ifeq ($(uname_S),Darwin)
NO_MEMMEM = YesPlease
USE_ST_TIMESPEC = YesPlease
HAVE_DEV_TTY = YesPlease
+ COMPAT_OBJS += compat/precompose_utf8.o
+ BASIC_CFLAGS += -DPRECOMPOSE_UNICODE
endif
ifeq ($(uname_S),SunOS)
NEEDS_SOCKET = YesPlease
diff --git a/builtin/init-db.c b/builtin/init-db.c
index 0dacb8b79..244fb7fc3 100644
--- a/builtin/init-db.c
+++ b/builtin/init-db.c
@@ -290,6 +290,7 @@ static int create_default_files(const char *template_path)
strcpy(path + len, "CoNfIg");
if (!access(path, F_OK))
git_config_set("core.ignorecase", "true");
+ probe_utf8_pathname_composition(path, len);
}
return reinit;
diff --git a/cache.h b/cache.h
index c22b92898..6a1aff5e2 100644
--- a/cache.h
+++ b/cache.h
@@ -563,6 +563,7 @@ extern int read_replace_refs;
extern int fsync_object_files;
extern int core_preload_index;
extern int core_apply_sparse_checkout;
+extern int precomposed_unicode;
enum branch_track {
BRANCH_TRACK_UNSPECIFIED = -1,
diff --git a/compat/precompose_utf8.c b/compat/precompose_utf8.c
new file mode 100644
index 000000000..d40d1b380
--- /dev/null
+++ b/compat/precompose_utf8.c
@@ -0,0 +1,190 @@
+/*
+ * Converts filenames from decomposed unicode into precomposed unicode.
+ * Used on MacOS X.
+*/
+
+
+#define PRECOMPOSE_UNICODE_C
+
+#include "cache.h"
+#include "utf8.h"
+#include "precompose_utf8.h"
+
+typedef char *iconv_ibp;
+const static char *repo_encoding = "UTF-8";
+const static char *path_encoding = "UTF-8-MAC";
+
+
+static size_t has_utf8(const char *s, size_t maxlen, size_t *strlen_c)
+{
+ const uint8_t *utf8p = (const uint8_t*) s;
+ size_t strlen_chars = 0;
+ size_t ret = 0;
+
+ if ((!utf8p) || (!*utf8p)) {
+ return 0;
+ }
+
+ while((*utf8p) && maxlen) {
+ if (*utf8p & 0x80)
+ ret++;
+ strlen_chars++;
+ utf8p++;
+ maxlen--;
+ }
+ if (strlen_c)
+ *strlen_c = strlen_chars;
+
+ return ret;
+}
+
+
+void probe_utf8_pathname_composition(char *path, int len)
+{
+ const static char *auml_nfc = "\xc3\xa4";
+ const static char *auml_nfd = "\x61\xcc\x88";
+ int output_fd;
+ if (precomposed_unicode != -1)
+ return; /* We found it defined in the global config, respect it */
+ path[len] = 0;
+ strcpy(path + len, auml_nfc);
+ output_fd = open(path, O_CREAT|O_EXCL|O_RDWR, 0600);
+ if (output_fd >=0) {
+ close(output_fd);
+ path[len] = 0;
+ strcpy(path + len, auml_nfd);
+ /* Indicate to the user, that we can configure it to true */
+ if (0 == access(path, R_OK))
+ git_config_set("core.precomposeunicode", "false");
+ /* To be backward compatible, set precomposed_unicode to 0 */
+ precomposed_unicode = 0;
+ path[len] = 0;
+ strcpy(path + len, auml_nfc);
+ unlink(path);
+ }
+}
+
+
+void precompose_argv(int argc, const char **argv)
+{
+ int i = 0;
+ const char *oldarg;
+ char *newarg;
+ iconv_t ic_precompose;
+
+ if (precomposed_unicode != 1)
+ return;
+
+ ic_precompose = iconv_open(repo_encoding, path_encoding);
+ if (ic_precompose == (iconv_t) -1)
+ return;
+
+ while (i < argc) {
+ size_t namelen;
+ oldarg = argv[i];
+ if (has_utf8(oldarg, (size_t)-1, &namelen)) {
+ newarg = reencode_string_iconv(oldarg, namelen, ic_precompose);
+ if (newarg)
+ argv[i] = newarg;
+ }
+ i++;
+ }
+ iconv_close(ic_precompose);
+}
+
+
+PREC_DIR *precompose_utf8_opendir(const char *dirname)
+{
+ PREC_DIR *prec_dir = xmalloc(sizeof(PREC_DIR));
+ prec_dir->dirent_nfc = xmalloc(sizeof(dirent_prec_psx));
+ prec_dir->dirent_nfc->max_name_len = sizeof(prec_dir->dirent_nfc->d_name);
+
+ prec_dir->dirp = opendir(dirname);
+ if (!prec_dir->dirp) {
+ free(prec_dir->dirent_nfc);
+ free(prec_dir);
+ return NULL;
+ } else {
+ int ret_errno = errno;
+ prec_dir->ic_precompose = iconv_open(repo_encoding, path_encoding);
+ /* if iconv_open() fails, die() in readdir() if needed */
+ errno = ret_errno;
+ }
+
+ return prec_dir;
+}
+
+struct dirent_prec_psx *precompose_utf8_readdir(PREC_DIR *prec_dir)
+{
+ struct dirent *res;
+ res = readdir(prec_dir->dirp);
+ if (res) {
+ size_t namelenz = strlen(res->d_name) + 1; /* \0 */
+ size_t new_maxlen = namelenz;
+
+ int ret_errno = errno;
+
+ if (new_maxlen > prec_dir->dirent_nfc->max_name_len) {
+ size_t new_len = sizeof(dirent_prec_psx) + new_maxlen -
+ sizeof(prec_dir->dirent_nfc->d_name);
+
+ prec_dir->dirent_nfc = xrealloc(prec_dir->dirent_nfc, new_len);
+ prec_dir->dirent_nfc->max_name_len = new_maxlen;
+ }
+
+ prec_dir->dirent_nfc->d_ino = res->d_ino;
+ prec_dir->dirent_nfc->d_type = res->d_type;
+
+ if ((precomposed_unicode == 1) && has_utf8(res->d_name, (size_t)-1, NULL)) {
+ if (prec_dir->ic_precompose == (iconv_t)-1) {
+ die("iconv_open(%s,%s) failed, but needed:\n"
+ " precomposed unicode is not supported.\n"
+ " If you wnat to use decomposed unicode, run\n"
+ " \"git config core.precomposeunicode false\"\n",
+ repo_encoding, path_encoding);
+ } else {
+ iconv_ibp cp = (iconv_ibp)res->d_name;
+ size_t inleft = namelenz;
+ char *outpos = &prec_dir->dirent_nfc->d_name[0];
+ size_t outsz = prec_dir->dirent_nfc->max_name_len;
+ size_t cnt;
+ errno = 0;
+ cnt = iconv(prec_dir->ic_precompose, &cp, &inleft, &outpos, &outsz);
+ if (errno || inleft) {
+ /*
+ * iconv() failed and errno could be E2BIG, EILSEQ, EINVAL, EBADF
+ * MacOS X avoids illegal byte sequemces.
+ * If they occur on a mounted drive (e.g. NFS) it is not worth to
+ * die() for that, but rather let the user see the original name
+ */
+ namelenz = 0; /* trigger strlcpy */
+ }
+ }
+ }
+ else
+ namelenz = 0;
+
+ if (!namelenz)
+ strlcpy(prec_dir->dirent_nfc->d_name, res->d_name,
+ prec_dir->dirent_nfc->max_name_len);
+
+ errno = ret_errno;
+ return prec_dir->dirent_nfc;
+ }
+ return NULL;
+}
+
+
+int precompose_utf8_closedir(PREC_DIR *prec_dir)
+{
+ int ret_value;
+ int ret_errno;
+ ret_value = closedir(prec_dir->dirp);
+ ret_errno = errno;
+ if (prec_dir->ic_precompose != (iconv_t)-1)
+ iconv_close(prec_dir->ic_precompose);
+ free(prec_dir->dirent_nfc);
+ free(prec_dir);
+ errno = ret_errno;
+ return ret_value;
+}
diff --git a/compat/precompose_utf8.h b/compat/precompose_utf8.h
new file mode 100644
index 000000000..3b73585fc
--- /dev/null
+++ b/compat/precompose_utf8.h
@@ -0,0 +1,45 @@
+#ifndef PRECOMPOSE_UNICODE_H
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <iconv.h>
+
+
+typedef struct dirent_prec_psx {
+ ino_t d_ino; /* Posix */
+ size_t max_name_len; /* See below */
+ unsigned char d_type; /* available on all systems git runs on */
+
+ /*
+ * See http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/dirent.h.html
+ * NAME_MAX + 1 should be enough, but some systems have
+ * NAME_MAX=255 and strlen(d_name) may return 508 or 510
+ * Solution: allocate more when needed, see precompose_utf8_readdir()
+ */
+ char d_name[NAME_MAX+1];
+} dirent_prec_psx;
+
+
+typedef struct {
+ iconv_t ic_precompose;
+ DIR *dirp;
+ struct dirent_prec_psx *dirent_nfc;
+} PREC_DIR;
+
+void precompose_argv(int argc, const char **argv);
+void probe_utf8_pathname_composition(char *, int);
+
+PREC_DIR *precompose_utf8_opendir(const char *dirname);
+struct dirent_prec_psx *precompose_utf8_readdir(PREC_DIR *dirp);
+int precompose_utf8_closedir(PREC_DIR *dirp);
+
+#ifndef PRECOMPOSE_UNICODE_C
+#define dirent dirent_prec_psx
+#define opendir(n) precompose_utf8_opendir(n)
+#define readdir(d) precompose_utf8_readdir(d)
+#define closedir(d) precompose_utf8_closedir(d)
+#define DIR PREC_DIR
+#endif /* PRECOMPOSE_UNICODE_C */
+
+#define PRECOMPOSE_UNICODE_H
+#endif /* PRECOMPOSE_UNICODE_H */
diff --git a/config.c b/config.c
index d28a499b0..40818e872 100644
--- a/config.c
+++ b/config.c
@@ -758,6 +758,11 @@ static int git_default_core_config(const char *var, const char *value)
return 0;
}
+ if (!strcmp(var, "core.precomposeunicode")) {
+ precomposed_unicode = git_config_bool(var, value);
+ return 0;
+ }
+
/* Add other config variables here and to Documentation/config.txt. */
return 0;
}
diff --git a/environment.c b/environment.c
index 669e498f5..85edd7f95 100644
--- a/environment.c
+++ b/environment.c
@@ -58,6 +58,7 @@ char *notes_ref_name;
int grafts_replace_parents = 1;
int core_apply_sparse_checkout;
int merge_log_config = -1;
+int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
struct startup_info *startup_info;
unsigned long pack_size_limit_cfg;
diff --git a/git-compat-util.h b/git-compat-util.h
index 5bd9ad7d2..35b095e8a 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -153,6 +153,15 @@
#endif
#endif
+/* used on Mac OS X */
+#ifdef PRECOMPOSE_UNICODE
+#include "compat/precompose_utf8.h"
+#else
+#define precompose_str(in,i_nfd2nfc)
+#define precompose_argv(c,v)
+#define probe_utf8_pathname_composition(a,b)
+#endif
+
#ifndef NO_LIBGEN_H
#include <libgen.h>
#else
diff --git a/parse-options.c b/parse-options.c
index ab70c29c4..c1c66bd40 100644
--- a/parse-options.c
+++ b/parse-options.c
@@ -476,6 +476,7 @@ int parse_options(int argc, const char **argv, const char *prefix,
usage_with_options(usagestr, options);
}
+ precompose_argv(argc, argv);
return parse_options_end(&ctx);
}
diff --git a/t/t3910-mac-os-precompose.sh b/t/t3910-mac-os-precompose.sh
new file mode 100755
index 000000000..88b7a20c1
--- /dev/null
+++ b/t/t3910-mac-os-precompose.sh
@@ -0,0 +1,164 @@
+#!/bin/sh
+#
+# Copyright (c) 2012 Torsten Bögershausen
+#
+
+test_description='utf-8 decomposed (nfd) converted to precomposed (nfc)'
+
+. ./test-lib.sh
+
+Adiarnfc=`printf '\303\204'`
+Adiarnfd=`printf 'A\314\210'`
+
+# check if the feature is compiled in
+mkdir junk &&
+>junk/"$Adiarnfc" &&
+case "$(cd junk && echo *)" in
+ "$Adiarnfd")
+ test_nfd=1
+ ;;
+ *) ;;
+esac
+rm -rf junk
+
+
+if test "$test_nfd"
+then
+ # create more utf-8 variables
+ Odiarnfc=`printf '\303\226'`
+ Odiarnfd=`printf 'O\314\210'`
+ AEligatu=`printf '\303\206'`
+ Invalidu=`printf '\303\377'`
+
+
+ #Create a string with 255 bytes (decomposed)
+ Alongd=$Adiarnfd$Adiarnfd$Adiarnfd$Adiarnfd$Adiarnfd$Adiarnfd$Adiarnfd #21 Byte
+ Alongd=$Alongd$Alongd$Alongd #63 Byte
+ Alongd=$Alongd$Alongd$Alongd$Alongd$Adiarnfd #255 Byte
+
+ #Create a string with 254 bytes (precomposed)
+ Alongc=$AEligatu$AEligatu$AEligatu$AEligatu$AEligatu #10 Byte
+ Alongc=$Alongc$Alongc$Alongc$Alongc$Alongc #50 Byte
+ Alongc=$Alongc$Alongc$Alongc$Alongc$Alongc #250 Byte
+ Alongc=$Alongc$AEligatu$AEligatu #254 Byte
+
+ test_expect_success "detect if nfd needed" '
+ precomposeunicode=`git config core.precomposeunicode` &&
+ test "$precomposeunicode" = false &&
+ git config core.precomposeunicode true
+ '
+ test_expect_success "setup" '
+ >x &&
+ git add x &&
+ git commit -m "1st commit" &&
+ git rm x &&
+ git commit -m "rm x"
+ '
+ test_expect_success "setup case mac" '
+ git checkout -b mac_os
+ '
+ # This will test nfd2nfc in readdir()
+ test_expect_success "add file Adiarnfc" '
+ echo f.Adiarnfc >f.$Adiarnfc &&
+ git add f.$Adiarnfc &&
+ git commit -m "add f.$Adiarnfc"
+ '
+ # This will test nfd2nfc in git stage()
+ test_expect_success "stage file d.Adiarnfd/f.Adiarnfd" '
+ mkdir d.$Adiarnfd &&
+ echo d.$Adiarnfd/f.$Adiarnfd >d.$Adiarnfd/f.$Adiarnfd &&
+ git stage d.$Adiarnfd/f.$Adiarnfd &&
+ git commit -m "add d.$Adiarnfd/f.$Adiarnfd"
+ '
+ test_expect_success "add link Adiarnfc" '
+ ln -s d.$Adiarnfd/f.$Adiarnfd l.$Adiarnfc &&
+ git add l.$Adiarnfc &&
+ git commit -m "add l.Adiarnfc"
+ '
+ # This will test git log
+ test_expect_success "git log f.Adiar" '
+ git log f.$Adiarnfc > f.Adiarnfc.log &&
+ git log f.$Adiarnfd > f.Adiarnfd.log &&
+ test -s f.Adiarnfc.log &&
+ test -s f.Adiarnfd.log &&
+ test_cmp f.Adiarnfc.log f.Adiarnfd.log &&
+ rm f.Adiarnfc.log f.Adiarnfd.log
+ '
+ # This will test git ls-files
+ test_expect_success "git lsfiles f.Adiar" '
+ git ls-files f.$Adiarnfc > f.Adiarnfc.log &&
+ git ls-files f.$Adiarnfd > f.Adiarnfd.log &&
+ test -s f.Adiarnfc.log &&
+ test -s f.Adiarnfd.log &&
+ test_cmp f.Adiarnfc.log f.Adiarnfd.log &&
+ rm f.Adiarnfc.log f.Adiarnfd.log
+ '
+ # This will test git mv
+ test_expect_success "git mv" '
+ git mv f.$Adiarnfd f.$Odiarnfc &&
+ git mv d.$Adiarnfd d.$Odiarnfc &&
+ git mv l.$Adiarnfd l.$Odiarnfc &&
+ git commit -m "mv Adiarnfd Odiarnfc"
+ '
+ # Files can be checked out as nfc
+ # And the link has been corrected from nfd to nfc
+ test_expect_success "git checkout nfc" '
+ rm f.$Odiarnfc &&
+ git checkout f.$Odiarnfc
+ '
+ # Make it possible to checkout files with their NFD names
+ test_expect_success "git checkout file nfd" '
+ rm -f f.* &&
+ git checkout f.$Odiarnfd
+ '
+ # Make it possible to checkout links with their NFD names
+ test_expect_success "git checkout link nfd" '
+ rm l.* &&
+ git checkout l.$Odiarnfd
+ '
+ test_expect_success "setup case mac2" '
+ git checkout master &&
+ git reset --hard &&
+ git checkout -b mac_os_2
+ '
+ # This will test nfd2nfc in git commit
+ test_expect_success "commit file d2.Adiarnfd/f.Adiarnfd" '
+ mkdir d2.$Adiarnfd &&
+ echo d2.$Adiarnfd/f.$Adiarnfd >d2.$Adiarnfd/f.$Adiarnfd &&
+ git add d2.$Adiarnfd/f.$Adiarnfd &&
+ git commit -m "add d2.$Adiarnfd/f.$Adiarnfd" -- d2.$Adiarnfd/f.$Adiarnfd
+ '
+ test_expect_success "setup for long decomposed filename" '
+ git checkout master &&
+ git reset --hard &&
+ git checkout -b mac_os_long_nfd_fn
+ '
+ test_expect_success "Add long decomposed filename" '
+ echo longd >$Alongd &&
+ git add * &&
+ git commit -m "Long filename"
+ '
+ test_expect_success "setup for long precomposed filename" '
+ git checkout master &&
+ git reset --hard &&
+ git checkout -b mac_os_long_nfc_fn
+ '
+ test_expect_success "Add long precomposed filename" '
+ echo longc >$Alongc &&
+ git add * &&
+ git commit -m "Long filename"
+ '
+ # Test if the global core.precomposeunicode stops autosensing
+ # Must be the last test case
+ test_expect_success "respect git config --global core.precomposeunicode" '
+ git config --global core.precomposeunicode true &&
+ rm -rf .git &&
+ git init &&
+ precomposeunicode=`git config core.precomposeunicode` &&
+ test "$precomposeunicode" = "true"
+ '
+else
+ say "Skipping nfc/nfd tests"
+fi
+
+test_done
diff --git a/utf8.c b/utf8.c
index 8acbc660d..a544f1545 100644
--- a/utf8.c
+++ b/utf8.c
@@ -433,19 +433,12 @@ int is_encoding_utf8(const char *name)
#else
typedef char * iconv_ibp;
#endif
-char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
+char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv)
{
- iconv_t conv;
- size_t insz, outsz, outalloc;
+ size_t outsz, outalloc;
char *out, *outpos;
iconv_ibp cp;
- if (!in_encoding)
- return NULL;
- conv = iconv_open(out_encoding, in_encoding);
- if (conv == (iconv_t) -1)
- return NULL;
- insz = strlen(in);
outsz = insz;
outalloc = outsz + 1; /* for terminating NUL */
out = xmalloc(outalloc);
@@ -459,7 +452,6 @@ char *reencode_string(const char *in, const char *out_encoding, const char *in_e
size_t sofar;
if (errno != E2BIG) {
free(out);
- iconv_close(conv);
return NULL;
}
/* insz has remaining number of bytes.
@@ -478,6 +470,20 @@ char *reencode_string(const char *in, const char *out_encoding, const char *in_e
break;
}
}
+ return out;
+}
+
+char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
+{
+ iconv_t conv;
+ char *out;
+
+ if (!in_encoding)
+ return NULL;
+ conv = iconv_open(out_encoding, in_encoding);
+ if (conv == (iconv_t) -1)
+ return NULL;
+ out = reencode_string_iconv(in, strlen(in), conv);
iconv_close(conv);
return out;
}
diff --git a/utf8.h b/utf8.h
index 81f2c82fa..3c0ae7624 100644
--- a/utf8.h
+++ b/utf8.h
@@ -14,6 +14,7 @@ int strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,
int indent, int indent2, int width);
#ifndef NO_ICONV
+char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv);
char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding);
#else
#define reencode_string(a,b,c) NULL