diff options
author | Junio C Hamano <gitster@pobox.com> | 2010-01-30 16:03:10 -0800 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2010-01-30 16:03:10 -0800 |
commit | 00d3278c8534a8244ae3447189401111e017fd5d (patch) | |
tree | f1c19903bc10ffe4816642040080fb6cfd5da376 /utf8.c | |
parent | b9b727ddb3c9e005bc4e9af0b990b6ef06d7f621 (diff) | |
parent | b319ef70a94731a5c6f18d07a49d5dda3f06f5d3 (diff) | |
download | git-00d3278c8534a8244ae3447189401111e017fd5d.tar.gz git-00d3278c8534a8244ae3447189401111e017fd5d.tar.xz |
Merge commit 'b319ef7' into jc/maint-fix-test-perm
* commit 'b319ef7': (8132 commits)
Add a small patch-mode testing library
git-apply--interactive: Refactor patch mode code
t8005: Nobody writes Russian in shift_jis
Fix severe breakage in "git-apply --whitespace=fix"
Update release notes for 1.6.4
After renaming a section, print any trailing variable definitions
Make section_name_match start on '[', and return the length on success
send-email: detect cycles in alias expansion
Show the presence of untracked files in the bash prompt.
SunOS grep does not understand -C<n> nor -e
Fix export_marks() error handling.
git repack: keep commits hidden by a graft
Add a test showing that 'git repack' throws away grafted-away parents
git branch: clean up detached branch handling
git branch: avoid unnecessary object lookups
git branch: fix performance problem
git svn: fix shallow clone when upstream revision is too new
do_one_ref(): null_sha1 check is not about broken ref
configure.ac: properly unset NEEDS_SSL_WITH_CRYPTO when sha1 func is missing
janitor: useless checks before free
...
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 128 |
1 files changed, 91 insertions, 37 deletions
@@ -3,15 +3,14 @@ /* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */ -typedef unsigned int ucs_char_t; /* assuming 32bit int */ - struct interval { int first; int last; }; /* auxiliary function for binary search in interval table */ -static int bisearch(ucs_char_t ucs, const struct interval *table, int max) { +static int bisearch(ucs_char_t ucs, const struct interval *table, int max) +{ int min = 0; int mid; @@ -62,7 +61,7 @@ static int bisearch(ucs_char_t ucs, const struct interval *table, int max) { * in ISO 10646. */ -static int wcwidth(ucs_char_t ch) +static int git_wcwidth(ucs_char_t ch) { /* * Sorted list of non-overlapping intervals of non-spacing characters, @@ -152,62 +151,118 @@ static int wcwidth(ucs_char_t ch) } /* - * This function returns the number of columns occupied by the character - * pointed to by the variable start. The pointer is updated to point at - * the next character. If it was not valid UTF-8, the pointer is set to NULL. + * Pick one ucs character starting from the location *start points at, + * and return it, while updating the *start pointer to point at the + * end of that character. When remainder_p is not NULL, the location + * holds the number of bytes remaining in the string that we are allowed + * to pick from. Otherwise we are allowed to pick up to the NUL that + * would eventually appear in the string. *remainder_p is also reduced + * by the number of bytes we have consumed. + * + * If the string was not a valid UTF-8, *start pointer is set to NULL + * and the return value is undefined. */ -int utf8_width(const char **start) +ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p) { unsigned char *s = (unsigned char *)*start; ucs_char_t ch; + size_t remainder, incr; + + /* + * A caller that assumes NUL terminated text can choose + * not to bother with the remainder length. We will + * stop at the first NUL. + */ + remainder = (remainder_p ? *remainder_p : 999); - if (*s < 0x80) { + if (remainder < 1) { + goto invalid; + } else if (*s < 0x80) { /* 0xxxxxxx */ ch = *s; - *start += 1; + incr = 1; } else if ((s[0] & 0xe0) == 0xc0) { /* 110XXXXx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] & 0xfe) == 0xc0) + if (remainder < 2 || + (s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) goto invalid; ch = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); - *start += 2; + incr = 2; } else if ((s[0] & 0xf0) == 0xe0) { /* 1110XXXX 10Xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || - /* surrogate? */ - (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || - /* U+FFFE or U+FFFF? */ - (s[0] == 0xef && s[1] == 0xbf && - (s[2] & 0xfe) == 0xbe)) + if (remainder < 3 || + (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + /* overlong? */ + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || + /* surrogate? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || + /* U+FFFE or U+FFFF? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) goto invalid; ch = ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); - *start += 3; + incr = 3; } else if ((s[0] & 0xf8) == 0xf0) { /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[3] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || - /* > U+10FFFF? */ - (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) + if (remainder < 4 || + (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + /* overlong? */ + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || + /* > U+10FFFF? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) goto invalid; ch = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); - *start += 4; + incr = 4; } else { invalid: *start = NULL; return 0; } - return wcwidth(ch); + *start += incr; + if (remainder_p) + *remainder_p = remainder - incr; + return ch; +} + +/* + * This function returns the number of columns occupied by the character + * pointed to by the variable start. The pointer is updated to point at + * the next character. When remainder_p is not NULL, it points at the + * location that stores the number of remaining bytes we can use to pick + * a character (see pick_one_utf8_char() above). + */ +int utf8_width(const char **start, size_t *remainder_p) +{ + ucs_char_t ch = pick_one_utf8_char(start, remainder_p); + if (!*start) + return 0; + return git_wcwidth(ch); +} + +/* + * Returns the total number of columns required by a null-terminated + * string, assuming that the string is utf8. Returns strlen() instead + * if the string does not look like a valid utf8 string. + */ +int utf8_strwidth(const char *string) +{ + int width = 0; + const char *orig = string; + + while (1) { + if (!string) + return strlen(orig); + if (!*string) + return width; + width += utf8_width(&string, NULL); + } } int is_utf8(const char *text) @@ -217,7 +272,7 @@ int is_utf8(const char *text) text++; continue; } - utf8_width(&text); + utf8_width(&text, NULL); if (!text) return 0; } @@ -277,13 +332,12 @@ int print_wrapped_text(const char *text, int indent, int indent2, int width) continue; } if (assume_utf8) - w += utf8_width(&text); + w += utf8_width(&text, NULL); else { w++; text++; } } - return w; } int is_encoding_utf8(const char *name) @@ -300,7 +354,7 @@ int is_encoding_utf8(const char *name) * with iconv. If the conversion fails, returns NULL. */ #ifndef NO_ICONV -#ifdef OLD_ICONV +#if defined(OLD_ICONV) || (defined(__sun__) && !defined(_XPG6)) typedef const char * iconv_ibp; #else typedef char * iconv_ibp; |