From f3eb54920eff356a30df144d50b45a5581e5eb13 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Wed, 14 Dec 2016 00:31:39 +0100 Subject: update_unicode.sh: move it into contrib/update-unicode As it's used only by a tiny minority of the Git developer population, this script does not belong in the main Git source directory. Move it into contrib/ and adjust the paths to account for the new location. Signed-off-by: Beat Bolli Signed-off-by: Junio C Hamano --- contrib/update-unicode/.gitignore | 3 +++ contrib/update-unicode/README | 20 +++++++++++++++++ contrib/update-unicode/update_unicode.sh | 38 ++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 contrib/update-unicode/.gitignore create mode 100644 contrib/update-unicode/README create mode 100755 contrib/update-unicode/update_unicode.sh (limited to 'contrib') diff --git a/contrib/update-unicode/.gitignore b/contrib/update-unicode/.gitignore new file mode 100644 index 000000000..b0ebc6aad --- /dev/null +++ b/contrib/update-unicode/.gitignore @@ -0,0 +1,3 @@ +uniset/ +UnicodeData.txt +EastAsianWidth.txt diff --git a/contrib/update-unicode/README b/contrib/update-unicode/README new file mode 100644 index 000000000..b9e2fc854 --- /dev/null +++ b/contrib/update-unicode/README @@ -0,0 +1,20 @@ +TL;DR: Run update_unicode.sh after the publication of a new Unicode +standard and commit the resulting unicode_width.h file. + +The long version +================ + +The Git source code ships the file unicode_width.h which contains +tables of zero and double width Unicode code points, respectively. +These tables are generated using update_unicode.sh in this directory. +update_unicode.sh itself uses a third-party tool, uniset, to query two +Unicode data files for the interesting code points. + +On first run, update_unicode.sh clones uniset from GitHub and builds it. +This requires a current-ish version of autoconf (2.69 works as of December +2016). 
+ +On each run, update_unicode.sh checks whether more recent Unicode data +files are available from the Unicode consortium, and rebuilds the header +unicode_width.h with the new data. The new header can then be +committed. diff --git a/contrib/update-unicode/update_unicode.sh b/contrib/update-unicode/update_unicode.sh new file mode 100755 index 000000000..7b901266c --- /dev/null +++ b/contrib/update-unicode/update_unicode.sh @@ -0,0 +1,38 @@ +#!/bin/sh +#See http://www.unicode.org/reports/tr44/ +# +#Me Enclosing_Mark an enclosing combining mark +#Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) +#Cf Format a format control character +# +cd "$(dirname "$0")" +UNICODEWIDTH_H=$(git rev-parse --show-toplevel)/unicode_width.h +( + if ! test -f UnicodeData.txt; then + wget http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + fi && + if ! test -f EastAsianWidth.txt; then + wget http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt + fi && + if ! test -d uniset; then + git clone https://github.com/depp/uniset.git + fi && + ( + cd uniset && + if ! test -x uniset; then + autoreconf -i && + ./configure --enable-warnings=-Werror CFLAGS='-O0 -ggdb' + fi && + make + ) && + UNICODE_DIR=. && export UNICODE_DIR && + cat >$UNICODEWIDTH_H <<-EOF + static const struct interval zero_width[] = { + $(uniset/uniset --32 cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD | + grep -v plane) + }; + static const struct interval double_width[] = { + $(uniset/uniset --32 eaw:F,W) + }; + EOF +) -- cgit v1.2.1 From b79e28e3701e83ba19d3b11ada4cc2bbdfaf8c29 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Wed, 14 Dec 2016 00:31:40 +0100 Subject: update_unicode.sh: remove an unnecessary subshell level After the move into contrib/update-unicode, we no longer create the unicode directory to have a clean working folder. Instead, the directory of the script is used. This means that the subshell can be removed. 
Signed-off-by: Beat Bolli Signed-off-by: Junio C Hamano --- contrib/update-unicode/update_unicode.sh | 53 ++++++++++++++++---------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'contrib') diff --git a/contrib/update-unicode/update_unicode.sh b/contrib/update-unicode/update_unicode.sh index 7b901266c..ff664ec95 100755 --- a/contrib/update-unicode/update_unicode.sh +++ b/contrib/update-unicode/update_unicode.sh @@ -7,32 +7,31 @@ # cd "$(dirname "$0")" UNICODEWIDTH_H=$(git rev-parse --show-toplevel)/unicode_width.h + +if ! test -f UnicodeData.txt; then + wget http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt +fi && +if ! test -f EastAsianWidth.txt; then + wget http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt +fi && +if ! test -d uniset; then + git clone https://github.com/depp/uniset.git +fi && ( - if ! test -f UnicodeData.txt; then - wget http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + cd uniset && + if ! test -x uniset; then + autoreconf -i && + ./configure --enable-warnings=-Werror CFLAGS='-O0 -ggdb' fi && - if ! test -f EastAsianWidth.txt; then - wget http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt - fi && - if ! test -d uniset; then - git clone https://github.com/depp/uniset.git - fi && - ( - cd uniset && - if ! test -x uniset; then - autoreconf -i && - ./configure --enable-warnings=-Werror CFLAGS='-O0 -ggdb' - fi && - make - ) && - UNICODE_DIR=. && export UNICODE_DIR && - cat >$UNICODEWIDTH_H <<-EOF - static const struct interval zero_width[] = { - $(uniset/uniset --32 cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD | - grep -v plane) - }; - static const struct interval double_width[] = { - $(uniset/uniset --32 eaw:F,W) - }; - EOF -) + make +) && +UNICODE_DIR=. 
&& export UNICODE_DIR && +cat >$UNICODEWIDTH_H <<-EOF +static const struct interval zero_width[] = { + $(uniset/uniset --32 cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD | + grep -v plane) +}; +static const struct interval double_width[] = { + $(uniset/uniset --32 eaw:F,W) +}; +EOF -- cgit v1.2.1 From 3f0a386309402c4a4b07f1c228e61023fbac3566 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Wed, 14 Dec 2016 00:31:41 +0100 Subject: update_unicode.sh: pin the uniset repo to a known good commit The uniset upstream has added more commits that for example change the hexadecimal output in '--32' mode to decimal. Let's pin the repo to a commit that still outputs the width tables in the format we want. Signed-off-by: Beat Bolli Signed-off-by: Junio C Hamano --- contrib/update-unicode/update_unicode.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'contrib') diff --git a/contrib/update-unicode/update_unicode.sh b/contrib/update-unicode/update_unicode.sh index ff664ec95..9f1bf31ff 100755 --- a/contrib/update-unicode/update_unicode.sh +++ b/contrib/update-unicode/update_unicode.sh @@ -15,7 +15,8 @@ if ! test -f EastAsianWidth.txt; then wget http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt fi && if ! test -d uniset; then - git clone https://github.com/depp/uniset.git + git clone https://github.com/depp/uniset.git && + ( cd uniset && git checkout 4b186196dd ) fi && ( cd uniset && -- cgit v1.2.1 From fef54f31627542b90f0a2af6c98ebb1e4c309211 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Wed, 14 Dec 2016 00:31:42 +0100 Subject: update_unicode.sh: automatically download newer definition files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checking just for the unicode data files' existence is not sufficient; we should also download them if a newer version exists on the Unicode consortium's servers. Option -N of wget does this nicely for us. 
Reviewed-by: Torsten Bögershausen Signed-off-by: Beat Bolli Signed-off-by: Junio C Hamano --- contrib/update-unicode/update_unicode.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'contrib') diff --git a/contrib/update-unicode/update_unicode.sh b/contrib/update-unicode/update_unicode.sh index 9f1bf31ff..56871a1f4 100755 --- a/contrib/update-unicode/update_unicode.sh +++ b/contrib/update-unicode/update_unicode.sh @@ -8,12 +8,8 @@ cd "$(dirname "$0")" UNICODEWIDTH_H=$(git rev-parse --show-toplevel)/unicode_width.h -if ! test -f UnicodeData.txt; then - wget http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt -fi && -if ! test -f EastAsianWidth.txt; then - wget http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt -fi && +wget -N http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt \ + http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt && if ! test -d uniset; then git clone https://github.com/depp/uniset.git && ( cd uniset && git checkout 4b186196dd ) -- cgit v1.2.1 From 3fe5799144f242ca9cacd0211318e36f9a09d7c6 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Wed, 14 Dec 2016 00:31:43 +0100 Subject: update_unicode.sh: remove the plane filter The uniset upstream has accepted my patches that eliminate the Unicode plane offsets from the output in '--32' mode. Remove the corresponding filter in update_unicode.sh. This also fixes the issue that the plane offsets were not removed from the second uniset call. Signed-off-by: Beat Bolli Signed-off-by: Junio C Hamano --- contrib/update-unicode/update_unicode.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'contrib') diff --git a/contrib/update-unicode/update_unicode.sh b/contrib/update-unicode/update_unicode.sh index 56871a1f4..e05db92d3 100755 --- a/contrib/update-unicode/update_unicode.sh +++ b/contrib/update-unicode/update_unicode.sh @@ -25,8 +25,7 @@ fi && UNICODE_DIR=. 
&& export UNICODE_DIR && cat >$UNICODEWIDTH_H <<-EOF static const struct interval zero_width[] = { - $(uniset/uniset --32 cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD | - grep -v plane) + $(uniset/uniset --32 cat:Me,Mn,Cf + U+1160..U+11FF - U+00AD) }; static const struct interval double_width[] = { $(uniset/uniset --32 eaw:F,W) -- cgit v1.2.1