aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2017-04-26 15:39:07 +0900
committerJunio C Hamano <gitster@pobox.com>2017-04-26 15:39:07 +0900
commit6cbc478d83b5773d1925869e50bf6067306f4817 (patch)
treed90bac61cb05a2138f38da2f92c13b09a56af70a
parent864033a3833d6d0f90bf5a2bd1128b295c1041a7 (diff)
parentb986df5c35a930858c6bba5f66c18ce6760d84e9 (diff)
downloadgit-6cbc478d83b5773d1925869e50bf6067306f4817.tar.gz
git-6cbc478d83b5773d1925869e50bf6067306f4817.tar.xz
Merge branch 'jh/add-index-entry-optim'
"git checkout" that handles a lot of paths has been optimized by reducing the number of unnecessary checks of paths in the has_dir_name() function. * jh/add-index-entry-optim: read-cache: speed up has_dir_name (part 2) read-cache: speed up has_dir_name (part 1) read-cache: speed up add_index_entry during checkout p0006-read-tree-checkout: perf test to time read-tree read-cache: add strcmp_offset function
-rw-r--r--Makefile1
-rw-r--r--cache.h1
-rw-r--r--read-cache.c139
-rw-r--r--t/helper/.gitignore1
-rw-r--r--t/helper/test-strcmp-offset.c22
-rwxr-xr-xt/perf/p0006-read-tree-checkout.sh67
-rw-r--r--t/perf/repos/.gitignore1
-rwxr-xr-xt/perf/repos/inflate-repo.sh85
-rwxr-xr-xt/perf/repos/many-files.sh110
-rwxr-xr-xt/t0065-strcmp-offset.sh21
10 files changed, 446 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index eb1a1a7cf..c739023e4 100644
--- a/Makefile
+++ b/Makefile
@@ -639,6 +639,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree
TEST_PROGRAMS_NEED_X += test-sha1
TEST_PROGRAMS_NEED_X += test-sha1-array
TEST_PROGRAMS_NEED_X += test-sigchain
+TEST_PROGRAMS_NEED_X += test-strcmp-offset
TEST_PROGRAMS_NEED_X += test-string-list
TEST_PROGRAMS_NEED_X += test-submodule-config
TEST_PROGRAMS_NEED_X += test-subprocess
diff --git a/cache.h b/cache.h
index 322ae5825..e1f0e182a 100644
--- a/cache.h
+++ b/cache.h
@@ -599,6 +599,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
extern int discard_index(struct index_state *);
extern int unmerged_index(const struct index_state *);
extern int verify_path(const char *path);
+extern int strcmp_offset(const char *s1, const char *s2, size_t *first_change);
extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
extern void adjust_dirname_case(struct index_state *istate, char *name);
extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
diff --git a/read-cache.c b/read-cache.c
index 008b33584..d8c657d6a 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -887,9 +887,32 @@ static int has_file_name(struct index_state *istate,
return retval;
}
+
+/*
+ * Like strcmp(), but also return the offset of the first change.
+ * If strings are equal, return the length.
+ */
+int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
+{
+ size_t k;
+
+ if (!first_change)
+ return strcmp(s1, s2);
+
+ for (k = 0; s1[k] == s2[k]; k++)
+ if (s1[k] == '\0')
+ break;
+
+ *first_change = k;
+ return (unsigned char)s1[k] - (unsigned char)s2[k];
+}
+
/*
* Do we have another file with a pathname that is a proper
* subset of the name we're trying to add?
+ *
+ * That is, is there another file in the index with a path
+ * that matches a sub-directory in the given entry?
*/
static int has_dir_name(struct index_state *istate,
const struct cache_entry *ce, int pos, int ok_to_replace)
@@ -898,9 +921,51 @@ static int has_dir_name(struct index_state *istate,
int stage = ce_stage(ce);
const char *name = ce->name;
const char *slash = name + ce_namelen(ce);
+ size_t len_eq_last;
+ int cmp_last = 0;
+
+ /*
+ * We are frequently called during an iteration on a sorted
+ * list of pathnames and while building a new index. Therefore,
+ * there is a high probability that this entry will eventually
+ * be appended to the index, rather than inserted in the middle.
+ * If we can confirm that, we can avoid binary searches on the
+ * components of the pathname.
+ *
+ * Compare the entry's full path with the last path in the index.
+ */
+ if (istate->cache_nr > 0) {
+ cmp_last = strcmp_offset(name,
+ istate->cache[istate->cache_nr - 1]->name,
+ &len_eq_last);
+ if (cmp_last > 0) {
+ if (len_eq_last == 0) {
+ /*
+ * The entry sorts AFTER the last one in the
+ * index and their paths have no common prefix,
+ * so there cannot be a F/D conflict.
+ */
+ return retval;
+ } else {
+ /*
+ * The entry sorts AFTER the last one in the
+ * index, but has a common prefix. Fall through
+ * to the loop below to disect the entry's path
+ * and see where the difference is.
+ */
+ }
+ } else if (cmp_last == 0) {
+ /*
+ * The entry exactly matches the last one in the
+ * index, but because of multiple stage and CE_REMOVE
+ * items, we fall through and let the regular search
+ * code handle it.
+ */
+ }
+ }
for (;;) {
- int len;
+ size_t len;
for (;;) {
if (*--slash == '/')
@@ -910,6 +975,67 @@ static int has_dir_name(struct index_state *istate,
}
len = slash - name;
+ if (cmp_last > 0) {
+ /*
+ * (len + 1) is a directory boundary (including
+ * the trailing slash). And since the loop is
+ * decrementing "slash", the first iteration is
+ * the longest directory prefix; subsequent
+ * iterations consider parent directories.
+ */
+
+ if (len + 1 <= len_eq_last) {
+ /*
+ * The directory prefix (including the trailing
+ * slash) also appears as a prefix in the last
+ * entry, so the remainder cannot collide (because
+ * strcmp said the whole path was greater).
+ *
+ * EQ: last: xxx/A
+ * this: xxx/B
+ *
+ * LT: last: xxx/file_A
+ * this: xxx/file_B
+ */
+ return retval;
+ }
+
+ if (len > len_eq_last) {
+ /*
+ * This part of the directory prefix (excluding
+ * the trailing slash) is longer than the known
+ * equal portions, so this sub-directory cannot
+ * collide with a file.
+ *
+ * GT: last: xxxA
+ * this: xxxB/file
+ */
+ return retval;
+ }
+
+ if (istate->cache_nr > 0 &&
+ ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
+ /*
+ * The directory prefix lines up with part of
+ * a longer file or directory name, but sorts
+ * after it, so this sub-directory cannot
+ * collide with a file.
+ *
+ * last: xxx/yy-file (because '-' sorts before '/')
+ * this: xxx/yy/abc
+ */
+ return retval;
+ }
+
+ /*
+ * This is a possible collision. Fall through and
+ * let the regular search code handle it.
+ *
+ * last: xxx
+ * this: xxx/file
+ */
+ }
+
pos = index_name_stage_pos(istate, name, len, stage);
if (pos >= 0) {
/*
@@ -1001,7 +1127,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
cache_tree_invalidate_path(istate, ce->name);
- pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
+
+ /*
+ * If this entry's path sorts after the last entry in the index,
+ * we can avoid searching for it.
+ */
+ if (istate->cache_nr > 0 &&
+ strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
+ pos = -istate->cache_nr - 1;
+ else
+ pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
/* existing match? Just replace it. */
if (pos >= 0) {
diff --git a/t/helper/.gitignore b/t/helper/.gitignore
index acd5db180..721650256 100644
--- a/t/helper/.gitignore
+++ b/t/helper/.gitignore
@@ -28,6 +28,7 @@
/test-sha1
/test-sha1-array
/test-sigchain
+/test-strcmp-offset
/test-string-list
/test-submodule-config
/test-subprocess
diff --git a/t/helper/test-strcmp-offset.c b/t/helper/test-strcmp-offset.c
new file mode 100644
index 000000000..4a45a54e9
--- /dev/null
+++ b/t/helper/test-strcmp-offset.c
@@ -0,0 +1,22 @@
+#include "cache.h"
+
+int cmd_main(int argc, const char **argv)
+{
+ int result;
+ size_t offset;
+
+ if (!argv[1] || !argv[2])
+ die("usage: %s <string1> <string2>", argv[0]);
+
+ result = strcmp_offset(argv[1], argv[2], &offset);
+
+ /*
+ * Because differnt CRTs behave differently, only rely on signs
+ * of the result values.
+ */
+ result = (result < 0 ? -1 :
+ result > 0 ? 1 :
+ 0);
+ printf("%d %"PRIuMAX"\n", result, (uintmax_t)offset);
+ return 0;
+}
diff --git a/t/perf/p0006-read-tree-checkout.sh b/t/perf/p0006-read-tree-checkout.sh
new file mode 100755
index 000000000..78cc23fe2
--- /dev/null
+++ b/t/perf/p0006-read-tree-checkout.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+#
+# This test measures the performance of various read-tree
+# and checkout operations. It is primarily interested in
+# the algorithmic costs of index operations and recursive
+# tree traversal -- and NOT disk I/O on thousands of files.
+
+test_description="Tests performance of read-tree"
+
+. ./perf-lib.sh
+
+test_perf_default_repo
+
+# If the test repo was generated by ./repos/many-files.sh
+# then we know something about the data shape and branches,
+# so we can isolate testing to the ballast-related commits
+# and setup sparse-checkout so we don't have to populate
+# the ballast files and directories.
+#
+# Otherwise, we make some general assumptions about the
+# repo and consider the entire history of the current
+# branch to be the ballast.
+
+test_expect_success "setup repo" '
+ if git rev-parse --verify refs/heads/p0006-ballast^{commit}
+ then
+ echo Assuming synthetic repo from many-files.sh
+ git branch br_base master
+ git branch br_ballast p0006-ballast^
+ git branch br_ballast_alias p0006-ballast^
+ git branch br_ballast_plus_1 p0006-ballast
+ git config --local core.sparsecheckout 1
+ cat >.git/info/sparse-checkout <<-EOF
+ /*
+ !ballast/*
+ EOF
+ else
+ echo Assuming non-synthetic repo...
+ git branch br_base $(git rev-list HEAD | tail -n 1)
+ git branch br_ballast HEAD^ || error "no ancestor commit from current head"
+ git branch br_ballast_alias HEAD^
+ git branch br_ballast_plus_1 HEAD
+ fi &&
+ git checkout -q br_ballast &&
+ nr_files=$(git ls-files | wc -l)
+'
+
+test_perf "read-tree br_base br_ballast ($nr_files)" '
+ git read-tree -m br_base br_ballast -n
+'
+
+test_perf "switch between br_base br_ballast ($nr_files)" '
+ git checkout -q br_base &&
+ git checkout -q br_ballast
+'
+
+test_perf "switch between br_ballast br_ballast_plus_1 ($nr_files)" '
+ git checkout -q br_ballast_plus_1 &&
+ git checkout -q br_ballast
+'
+
+test_perf "switch between aliases ($nr_files)" '
+ git checkout -q br_ballast_alias &&
+ git checkout -q br_ballast
+'
+
+test_done
diff --git a/t/perf/repos/.gitignore b/t/perf/repos/.gitignore
new file mode 100644
index 000000000..72e3dc3e1
--- /dev/null
+++ b/t/perf/repos/.gitignore
@@ -0,0 +1 @@
+gen-*/
diff --git a/t/perf/repos/inflate-repo.sh b/t/perf/repos/inflate-repo.sh
new file mode 100755
index 000000000..fcfc992b5
--- /dev/null
+++ b/t/perf/repos/inflate-repo.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+# Inflate the size of an EXISTING repo.
+#
+# This script should be run inside the worktree of a TEST repo.
+# It will use the contents of the current HEAD to generate a
+# commit containing copies of the current worktree such that the
+# total size of the commit has at least <target_size> files.
+#
+# Usage: [-t target_size] [-b branch_name]
+
+set -e
+
+target_size=10000
+branch_name=p0006-ballast
+ballast=ballast
+
+while test "$#" -ne 0
+do
+ case "$1" in
+ -b)
+ shift;
+ test "$#" -ne 0 || { echo 'error: -b requires an argument' >&2; exit 1; }
+ branch_name=$1;
+ shift ;;
+ -t)
+ shift;
+ test "$#" -ne 0 || { echo 'error: -t requires an argument' >&2; exit 1; }
+ target_size=$1;
+ shift ;;
+ *)
+ echo "error: unknown option '$1'" >&2; exit 1 ;;
+ esac
+done
+
+git ls-tree -r HEAD >GEN_src_list
+nr_src_files=$(cat GEN_src_list | wc -l)
+
+src_branch=$(git symbolic-ref --short HEAD)
+
+echo "Branch $src_branch initially has $nr_src_files files."
+
+if test $target_size -le $nr_src_files
+then
+ echo "Repository already exceeds target size $target_size."
+ rm GEN_src_list
+ exit 1
+fi
+
+# Create well-known branch and add 1 file change to start
+# if off before the ballast.
+git checkout -b $branch_name HEAD
+echo "$target_size" > inflate-repo.params
+git add inflate-repo.params
+git commit -q -m params
+
+# Create ballast for in our branch.
+copy=1
+nr_files=$nr_src_files
+while test $nr_files -lt $target_size
+do
+ sed -e "s| | $ballast/$copy/|" <GEN_src_list |
+ git update-index --index-info
+
+ nr_files=$(expr $nr_files + $nr_src_files)
+ copy=$(expr $copy + 1)
+done
+rm GEN_src_list
+git commit -q -m "ballast"
+
+# Modify 1 file and commit.
+echo "$target_size" >> inflate-repo.params
+git add inflate-repo.params
+git commit -q -m "ballast plus 1"
+
+nr_files=$(git ls-files | wc -l)
+
+# Checkout master to put repo in canonical state (because
+# the perf test may need to clone and enable sparse-checkout
+# before attempting to checkout a commit with the ballast
+# (because it may contain 100K directories and 1M files)).
+git checkout $src_branch
+
+echo "Repository inflated. Branch $branch_name has $nr_files files."
+
+exit 0
diff --git a/t/perf/repos/many-files.sh b/t/perf/repos/many-files.sh
new file mode 100755
index 000000000..28720e4e1
--- /dev/null
+++ b/t/perf/repos/many-files.sh
@@ -0,0 +1,110 @@
+#!/bin/sh
+# Generate test data repository using the given parameters.
+# When omitted, we create "gen-many-files-d-w-f.git".
+#
+# Usage: [-r repo] [-d depth] [-w width] [-f files]
+#
+# -r repo: path to the new repo to be generated
+# -d depth: the depth of sub-directories
+# -w width: the number of sub-directories at each level
+# -f files: the number of files created in each directory
+#
+# Note that all files will have the same SHA-1 and each
+# directory at a level will have the same SHA-1, so we
+# will potentially have a large index, but not a large
+# ODB.
+#
+# Ballast will be created under "ballast/".
+
+EMPTY_BLOB=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
+
+set -e
+
+# (5, 10, 9) will create 999,999 ballast files.
+# (4, 10, 9) will create 99,999 ballast files.
+depth=5
+width=10
+files=9
+
+while test "$#" -ne 0
+do
+ case "$1" in
+ -r)
+ shift;
+ test "$#" -ne 0 || { echo 'error: -r requires an argument' >&2; exit 1; }
+ repo=$1;
+ shift ;;
+ -d)
+ shift;
+ test "$#" -ne 0 || { echo 'error: -d requires an argument' >&2; exit 1; }
+ depth=$1;
+ shift ;;
+ -w)
+ shift;
+ test "$#" -ne 0 || { echo 'error: -w requires an argument' >&2; exit 1; }
+ width=$1;
+ shift ;;
+ -f)
+ shift;
+ test "$#" -ne 0 || { echo 'error: -f requires an argument' >&2; exit 1; }
+ files=$1;
+ shift ;;
+ *)
+ echo "error: unknown option '$1'" >&2; exit 1 ;;
+ esac
+done
+
+# Inflate the index with thousands of empty files.
+# usage: dir depth width files
+fill_index() {
+ awk -v arg_dir=$1 -v arg_depth=$2 -v arg_width=$3 -v arg_files=$4 '
+ function make_paths(dir, depth, width, files, f, w) {
+ for (f = 1; f <= files; f++) {
+ print dir "/file" f
+ }
+ if (depth > 0) {
+ for (w = 1; w <= width; w++) {
+ make_paths(dir "/dir" w, depth - 1, width, files)
+ }
+ }
+ }
+ END { make_paths(arg_dir, arg_depth, arg_width, arg_files) }
+ ' </dev/null |
+ sed "s/^/100644 $EMPTY_BLOB /" |
+ git update-index --index-info
+ return 0
+}
+
+[ -z "$repo" ] && repo=gen-many-files-$depth.$width.$files.git
+
+mkdir $repo
+cd $repo
+git init .
+
+# Create an initial commit just to define master.
+touch many-files.empty
+echo "$depth $width $files" >many-files.params
+git add many-files.*
+git commit -q -m params
+
+# Create ballast for p0006 based upon the given params and
+# inflate the index with thousands of empty files and commit.
+git checkout -b p0006-ballast
+fill_index "ballast" $depth $width $files
+git commit -q -m "ballast"
+
+nr_files=$(git ls-files | wc -l)
+
+# Modify 1 file and commit.
+echo "$depth $width $files" >>many-files.params
+git add many-files.params
+git commit -q -m "ballast plus 1"
+
+# Checkout master to put repo in canonical state (because
+# the perf test may need to clone and enable sparse-checkout
+# before attempting to checkout a commit with the ballast
+# (because it may contain 100K directories and 1M files)).
+git checkout master
+
+echo "Repository "$repo" ($depth, $width, $files) created. Ballast $nr_files."
+exit 0
diff --git a/t/t0065-strcmp-offset.sh b/t/t0065-strcmp-offset.sh
new file mode 100755
index 000000000..7d6d21425
--- /dev/null
+++ b/t/t0065-strcmp-offset.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+test_description='Test strcmp_offset functionality'
+
+. ./test-lib.sh
+
+while read s1 s2 expect
+do
+ test_expect_success "strcmp_offset($s1, $s2)" '
+ echo "$expect" >expect &&
+ test-strcmp-offset "$s1" "$s2" >actual &&
+ test_cmp expect actual
+ '
+done <<-EOF
+abc abc 0 3
+abc def -1 0
+abc abz -1 2
+abc abcdef -1 3
+EOF
+
+test_done