aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2016-07-19 13:22:17 -0700
committerJunio C Hamano <gitster@pobox.com>2016-07-19 13:22:17 -0700
commita883c31af66195556a775f75851f46573c98e43d (patch)
tree895fcd7abecd46cf97821bdad4bc8d64b3c4c142
parenta63d31b4d3640960d9a71606eee80c32459f906e (diff)
parent695f95ba5dc4ab44f327574f85c3ebe7ebf449b1 (diff)
downloadgit-a883c31af66195556a775f75851f46573c98e43d.tar.gz
git-a883c31af66195556a775f75851f46573c98e43d.tar.xz
Merge branch 'nd/icase'
"git grep -i" has been taught to fold case in non-ascii locales correctly. * nd/icase: grep.c: reuse "icase" variable diffcore-pickaxe: support case insensitive match on non-ascii diffcore-pickaxe: Add regcomp_or_die() grep/pcre: support utf-8 gettext: add is_utf8_locale() grep/pcre: prepare locale-dependent tables for icase matching grep: rewrite an if/else condition to avoid duplicate expression grep/icase: avoid kwsset when -F is specified grep/icase: avoid kwsset on literal non-ascii strings test-regex: expose full regcomp() to the command line test-regex: isolate the bug test code grep: break down an "if" stmt in preparation for next changes
-rw-r--r--diffcore-pickaxe.c33
-rw-r--r--gettext.c24
-rw-r--r--gettext.h1
-rw-r--r--grep.c64
-rw-r--r--grep.h1
-rw-r--r--quote.c37
-rw-r--r--quote.h1
-rw-r--r--t/helper/test-regex.c59
-rwxr-xr-xt/t0070-fundamental.sh2
-rwxr-xr-xt/t7812-grep-icase-non-ascii.sh71
-rwxr-xr-xt/t7813-grep-icase-iso.sh19
11 files changed, 291 insertions, 21 deletions
diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index 7715c13ec..55067cab6 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -7,6 +7,8 @@
#include "diffcore.h"
#include "xdiff-interface.h"
#include "kwset.h"
+#include "commit.h"
+#include "quote.h"
typedef int (*pickaxe_fn)(mmfile_t *one, mmfile_t *two,
struct diff_options *o,
@@ -198,6 +200,18 @@ static void pickaxe(struct diff_queue_struct *q, struct diff_options *o,
*q = outq;
}
+static void regcomp_or_die(regex_t *regex, const char *needle, int cflags)
+{
+ int err = regcomp(regex, needle, cflags);
+ if (err) {
+ /* The POSIX.2 people are surely sick */
+ char errbuf[1024];
+ regerror(err, regex, errbuf, 1024);
+ regfree(regex);
+ die("invalid regex: %s", errbuf);
+ }
+}
+
void diffcore_pickaxe(struct diff_options *o)
{
const char *needle = o->pickaxe;
@@ -206,18 +220,19 @@ void diffcore_pickaxe(struct diff_options *o)
kwset_t kws = NULL;
if (opts & (DIFF_PICKAXE_REGEX | DIFF_PICKAXE_KIND_G)) {
- int err;
int cflags = REG_EXTENDED | REG_NEWLINE;
if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE))
cflags |= REG_ICASE;
- err = regcomp(&regex, needle, cflags);
- if (err) {
- /* The POSIX.2 people are surely sick */
- char errbuf[1024];
- regerror(err, &regex, errbuf, 1024);
- regfree(&regex);
- die("invalid regex: %s", errbuf);
- }
+ regcomp_or_die(&regex, needle, cflags);
+ regexp = &regex;
+ } else if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE) &&
+ has_non_ascii(needle)) {
+ struct strbuf sb = STRBUF_INIT;
+ int cflags = REG_NEWLINE | REG_ICASE;
+
+ basic_regex_quote_buf(&sb, needle);
+ regcomp_or_die(&regex, sb.buf, cflags);
+ strbuf_release(&sb);
regexp = &regex;
} else {
kws = kwsalloc(DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE)
diff --git a/gettext.c b/gettext.c
index a268a2c52..db727ea02 100644
--- a/gettext.c
+++ b/gettext.c
@@ -18,6 +18,8 @@
# endif
#endif
+static const char *charset;
+
/*
* Guess the user's preferred languages from the value in LANGUAGE environment
* variable and LC_MESSAGES locale category if NO_GETTEXT is not defined.
@@ -65,7 +67,6 @@ static int test_vsnprintf(const char *fmt, ...)
return ret;
}
-static const char *charset;
static void init_gettext_charset(const char *domain)
{
/*
@@ -172,8 +173,27 @@ int gettext_width(const char *s)
{
static int is_utf8 = -1;
if (is_utf8 == -1)
- is_utf8 = !strcmp(charset, "UTF-8");
+ is_utf8 = is_utf8_locale();
return is_utf8 ? utf8_strwidth(s) : strlen(s);
}
#endif
+
+int is_utf8_locale(void)
+{
+#ifdef NO_GETTEXT
+ if (!charset) {
+ const char *env = getenv("LC_ALL");
+ if (!env || !*env)
+ env = getenv("LC_CTYPE");
+ if (!env || !*env)
+ env = getenv("LANG");
+ if (!env)
+ env = "";
+ if (strchr(env, '.'))
+ env = strchr(env, '.') + 1;
+ charset = xstrdup(env);
+ }
+#endif
+ return is_encoding_utf8(charset);
+}
diff --git a/gettext.h b/gettext.h
index 33696a40b..7eee64a34 100644
--- a/gettext.h
+++ b/gettext.h
@@ -90,5 +90,6 @@ const char *Q_(const char *msgid, const char *plu, unsigned long n)
#endif
const char *get_preferred_languages(void);
+extern int is_utf8_locale(void);
#endif
diff --git a/grep.c b/grep.c
index 1e15b6292..394c8569d 100644
--- a/grep.c
+++ b/grep.c
@@ -4,6 +4,8 @@
#include "xdiff-interface.h"
#include "diff.h"
#include "diffcore.h"
+#include "commit.h"
+#include "quote.h"
static int grep_source_load(struct grep_source *gs);
static int grep_source_is_binary(struct grep_source *gs);
@@ -322,11 +324,16 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt)
int erroffset;
int options = PCRE_MULTILINE;
- if (opt->ignore_case)
+ if (opt->ignore_case) {
+ if (has_non_ascii(p->pattern))
+ p->pcre_tables = pcre_maketables();
options |= PCRE_CASELESS;
+ }
+ if (is_utf8_locale() && has_non_ascii(p->pattern))
+ options |= PCRE_UTF8;
p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
- NULL);
+ p->pcre_tables);
if (!p->pcre_regexp)
compile_regexp_failed(p, error);
@@ -360,6 +367,7 @@ static void free_pcre_regexp(struct grep_pat *p)
{
pcre_free(p->pcre_regexp);
pcre_free(p->pcre_extra_info);
+ pcre_free((void *)p->pcre_tables);
}
#else /* !USE_LIBPCRE */
static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt)
@@ -396,26 +404,68 @@ static int is_fixed(const char *s, size_t len)
return 1;
}
+static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
+{
+ struct strbuf sb = STRBUF_INIT;
+ int err;
+ int regflags;
+
+ basic_regex_quote_buf(&sb, p->pattern);
+ regflags = opt->regflags & ~REG_EXTENDED;
+ if (opt->ignore_case)
+ regflags |= REG_ICASE;
+ err = regcomp(&p->regexp, sb.buf, regflags);
+ if (opt->debug)
+ fprintf(stderr, "fixed %s\n", sb.buf);
+ strbuf_release(&sb);
+ if (err) {
+ char errbuf[1024];
+ regerror(err, &p->regexp, errbuf, sizeof(errbuf));
+ regfree(&p->regexp);
+ compile_regexp_failed(p, errbuf);
+ }
+}
+
static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
{
+ int icase, ascii_only;
int err;
p->word_regexp = opt->word_regexp;
p->ignore_case = opt->ignore_case;
+ icase = opt->regflags & REG_ICASE || p->ignore_case;
+ ascii_only = !has_non_ascii(p->pattern);
+ /*
+ * Even when -F (fixed) asks us to do a non-regexp search, we
+ * may not be able to correctly case-fold when -i
+ * (ignore-case) is asked (in which case, we'll synthesize a
+ * regexp to match the pattern that matches regexp special
+ * characters literally, while ignoring case differences). On
+ * the other hand, even without -F, if the pattern does not
+ * have any regexp special characters and there is no need for
+ * case-folding search, we can internally turn it into a
+ * simple string match using kws. p->fixed tells us if we
+ * want to use kws.
+ */
if (opt->fixed || is_fixed(p->pattern, p->patternlen))
- p->fixed = 1;
+ p->fixed = !icase || ascii_only;
else
p->fixed = 0;
if (p->fixed) {
- if (opt->regflags & REG_ICASE || p->ignore_case)
- p->kws = kwsalloc(tolower_trans_tbl);
- else
- p->kws = kwsalloc(NULL);
+ p->kws = kwsalloc(icase ? tolower_trans_tbl : NULL);
kwsincr(p->kws, p->pattern, p->patternlen);
kwsprep(p->kws);
return;
+ } else if (opt->fixed) {
+ /*
+ * We come here when the pattern has the non-ascii
+ * characters we cannot case-fold, and asked to
+ * ignore-case.
+ */
+ compile_fixed_regexp(p, opt);
+ return;
}
if (opt->pcre) {
diff --git a/grep.h b/grep.h
index 95f197a8d..cee4357b1 100644
--- a/grep.h
+++ b/grep.h
@@ -48,6 +48,7 @@ struct grep_pat {
regex_t regexp;
pcre *pcre_regexp;
pcre_extra *pcre_extra_info;
+ const unsigned char *pcre_tables;
kwset_t kws;
unsigned fixed:1;
unsigned ignore_case:1;
diff --git a/quote.c b/quote.c
index b281a8fe4..53b98a5b8 100644
--- a/quote.c
+++ b/quote.c
@@ -453,3 +453,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src)
}
strbuf_addch(sb, '"');
}
+
+void basic_regex_quote_buf(struct strbuf *sb, const char *src)
+{
+ char c;
+
+ if (*src == '^') {
+ /* only beginning '^' is special and needs quoting */
+ strbuf_addch(sb, '\\');
+ strbuf_addch(sb, *src++);
+ }
+ if (*src == '*')
+ /* beginning '*' is not special, no quoting */
+ strbuf_addch(sb, *src++);
+
+ while ((c = *src++)) {
+ switch (c) {
+ case '[':
+ case '.':
+ case '\\':
+ case '*':
+ strbuf_addch(sb, '\\');
+ strbuf_addch(sb, c);
+ break;
+
+ case '$':
+ /* only the end '$' is special and needs quoting */
+ if (*src == '\0')
+ strbuf_addch(sb, '\\');
+ strbuf_addch(sb, c);
+ break;
+
+ default:
+ strbuf_addch(sb, c);
+ break;
+ }
+ }
+}
diff --git a/quote.h b/quote.h
index 6c53a2cc6..66f5644aa 100644
--- a/quote.h
+++ b/quote.h
@@ -70,5 +70,6 @@ extern char *quote_path_relative(const char *in, const char *prefix,
extern void perl_quote_buf(struct strbuf *sb, const char *src);
extern void python_quote_buf(struct strbuf *sb, const char *src);
extern void tcl_quote_buf(struct strbuf *sb, const char *src);
+extern void basic_regex_quote_buf(struct strbuf *sb, const char *src);
#endif
diff --git a/t/helper/test-regex.c b/t/helper/test-regex.c
index 0dc598ecd..eff26f534 100644
--- a/t/helper/test-regex.c
+++ b/t/helper/test-regex.c
@@ -1,6 +1,23 @@
#include "git-compat-util.h"
+#include "gettext.h"
-int main(int argc, char **argv)
+struct reg_flag {
+ const char *name;
+ int flag;
+};
+
+static struct reg_flag reg_flags[] = {
+ { "EXTENDED", REG_EXTENDED },
+ { "NEWLINE", REG_NEWLINE },
+ { "ICASE", REG_ICASE },
+ { "NOTBOL", REG_NOTBOL },
+#ifdef REG_STARTEND
+ { "STARTEND", REG_STARTEND },
+#endif
+ { NULL, 0 }
+};
+
+static int test_regex_bug(void)
{
char *pat = "[^={} \t]+";
char *str = "={}\nfred";
@@ -16,5 +33,43 @@ int main(int argc, char **argv)
if (m[0].rm_so == 3) /* matches '\n' when it should not */
die("regex bug confirmed: re-build git with NO_REGEX=1");
- exit(0);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ const char *pat;
+ const char *str;
+ int flags = 0;
+ regex_t r;
+ regmatch_t m[1];
+
+ if (argc == 2 && !strcmp(argv[1], "--bug"))
+ return test_regex_bug();
+ else if (argc < 3)
+ usage("test-regex --bug\n"
+ "test-regex <pattern> <string> [<options>]");
+
+ argv++;
+ pat = *argv++;
+ str = *argv++;
+ while (*argv) {
+ struct reg_flag *rf;
+ for (rf = reg_flags; rf->name; rf++)
+ if (!strcmp(*argv, rf->name)) {
+ flags |= rf->flag;
+ break;
+ }
+ if (!rf->name)
+ die("do not recognize %s", *argv);
+ argv++;
+ }
+ git_setup_gettext();
+
+ if (regcomp(&r, pat, flags))
+ die("failed regcomp() for pattern '%s'", pat);
+ if (regexec(&r, str, 1, m, 0))
+ return 1;
+
+ return 0;
}
diff --git a/t/t0070-fundamental.sh b/t/t0070-fundamental.sh
index 5ed69a6f5..991ed2a48 100755
--- a/t/t0070-fundamental.sh
+++ b/t/t0070-fundamental.sh
@@ -31,7 +31,7 @@ test_expect_success 'git_mkstemps_mode does not fail if fd 0 is not open' '
test_expect_success 'check for a bug in the regex routines' '
# if this test fails, re-build git with NO_REGEX=1
- test-regex
+ test-regex --bug
'
test_done
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
new file mode 100755
index 000000000..169fd8d70
--- /dev/null
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -0,0 +1,71 @@
+#!/bin/sh
+
+test_description='grep icase on non-English locales'
+
+. ./lib-gettext.sh
+
+test_expect_success GETTEXT_LOCALE 'setup' '
+ test_write_lines "TILRAUN: Halló Heimur!" >file &&
+ git add file &&
+ LC_ALL="$is_IS_locale" &&
+ export LC_ALL
+'
+
+test_have_prereq GETTEXT_LOCALE &&
+test-regex "HALLÓ" "Halló" ICASE &&
+test_set_prereq REGEX_LOCALE
+
+test_expect_success REGEX_LOCALE 'grep literal string, no -F' '
+ git grep -i "TILRAUN: Halló Heimur!" &&
+ git grep -i "TILRAUN: HALLÓ HEIMUR!"
+'
+
+test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 icase' '
+ git grep --perl-regexp "TILRAUN: H.lló Heimur!" &&
+ git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" &&
+ git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!"
+'
+
+test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 string with "+"' '
+ test_write_lines "TILRAUN: Hallóó Heimur!" >file2 &&
+ git add file2 &&
+ git grep -l --perl-regexp "TILRAUN: H.lló+ Heimur!" >actual &&
+ echo file >expected &&
+ echo file2 >>expected &&
+ test_cmp expected actual
+'
+
+test_expect_success REGEX_LOCALE 'grep literal string, with -F' '
+ git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null |
+ grep fixed >debug1 &&
+ test_write_lines "fixed TILRAUN: Halló Heimur!" >expect1 &&
+ test_cmp expect1 debug1 &&
+
+ git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!" 2>&1 >/dev/null |
+ grep fixed >debug2 &&
+ test_write_lines "fixed TILRAUN: HALLÓ HEIMUR!" >expect2 &&
+ test_cmp expect2 debug2
+'
+
+test_expect_success REGEX_LOCALE 'grep string with regex, with -F' '
+ test_write_lines "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file &&
+
+ git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null |
+ grep fixed >debug1 &&
+ test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 &&
+ test_cmp expect1 debug1 &&
+
+ git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$" 2>&1 >/dev/null |
+ grep fixed >debug2 &&
+ test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 &&
+ test_cmp expect2 debug2
+'
+
+test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
+ git commit -m first &&
+ git log --format=%f -i -S"TILRAUN: HALLÓ HEIMUR!" >actual &&
+ echo first >expected &&
+ test_cmp expected actual
+'
+
+test_done
diff --git a/t/t7813-grep-icase-iso.sh b/t/t7813-grep-icase-iso.sh
new file mode 100755
index 000000000..efef7fb81
--- /dev/null
+++ b/t/t7813-grep-icase-iso.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+test_description='grep icase on non-English locales'
+
+. ./lib-gettext.sh
+
+test_expect_success GETTEXT_ISO_LOCALE 'setup' '
+ printf "TILRAUN: Halló Heimur!" >file &&
+ git add file &&
+ LC_ALL="$is_IS_iso_locale" &&
+ export LC_ALL
+'
+
+test_expect_success GETTEXT_ISO_LOCALE,LIBPCRE 'grep pcre string' '
+ git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" &&
+ git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!"
+'
+
+test_done