diff options
author | Jeff King <peff@peff.net> | 2011-09-12 15:56:52 -0400 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2011-09-12 14:16:41 -0700 |
commit | 6859de45a94ec0e88703250d9d4df64a09042333 (patch) | |
tree | 9b62995c0d8e3f3a12a992eed1397a7e5e423c66 | |
parent | f696543dad6c7ba27b0c4fab167a5687263a9ba0 (diff) | |
download | git-6859de45a94ec0e88703250d9d4df64a09042333.tar.gz git-6859de45a94ec0e88703250d9d4df64a09042333.tar.xz |
fetch: avoid quadratic loop checking for updated submodules
Recent versions of git can be slow to fetch repositories with a
large number of refs (or when the local repository already has a
large number of refs). For example, GitHub makes pull-requests
available as refs, which can lead to a large number of
available refs. This slowness goes away when submodule
recursion is turned off:
    $ git ls-remote git://github.com/rails/rails.git | wc -l
    3034

    [this takes ~10 seconds of CPU time to complete]
    git fetch --recurse-submodules=no \
        git://github.com/rails/rails.git "refs/*:refs/*"

    [this still isn't done after 10 _minutes_ of pegging the CPU]
    git fetch \
        git://github.com/rails/rails.git "refs/*:refs/*"
You can produce a quicker and simpler test case like this:
    doit() {
      head=`git rev-parse HEAD`
      for i in `seq 1 $1`; do
        echo $head refs/heads/ref$i
      done >.git/packed-refs
      echo "==> $1"
      rm -rf dest
      git init -q --bare dest &&
      (cd dest && time git.compile fetch -q .. refs/*:refs/*)
    }
rm -rf repo
git init -q repo && cd repo &&
>file && git add file && git commit -q -m one
doit 100
doit 200
doit 400
doit 800
doit 1600
doit 3200
Which yields timings like:
    # refs  seconds of CPU
       100            0.06
       200            0.24
       400            0.95
       800            3.39
      1600           13.66
      3200           54.09
Notice that although the number of refs doubles in each
trial, the CPU time spent quadruples.
The problem is that the submodule recursion code works
something like:
    - for each ref we fetch
      - for each commit in git rev-list $new_sha1 --not --all
        - add modified submodules to list
    - fetch any newly referenced submodules
But that means if we fetch N refs, we start N revision
walks. Worse, because we use "--all", the number of refs we
must process that constitute "--all" keeps growing, too. And
you end up doing O(N^2) ref resolutions.
Instead, this patch structures the code like this:
    - for each sha1 we already have
      - add $old_sha1 to list $old
    - for each ref we fetch
      - add $new_sha1 to list $new
    - for each commit in git rev-list $new --not $old
      - add modified submodules to list
    - fetch any newly referenced submodules
This yields timings like:
    # refs  seconds of CPU
       100            0.00
       200            0.04
       400            0.04
       800            0.10
      1600            0.21
      3200            0.39
Note that the amount of effort doubles as the number of refs
doubles. Similarly, the fetch of rails.git takes about as
much time as it does with --recurse-submodules=no.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
-rw-r--r-- | submodule.c | 77 |
1 files changed, 72 insertions, 5 deletions
diff --git a/submodule.c b/submodule.c index b6dec70bd..9431c42df 100644 --- a/submodule.c +++ b/submodule.c @@ -8,12 +8,17 @@ #include "diffcore.h" #include "refs.h" #include "string-list.h" +#include "sha1-array.h" static struct string_list config_name_for_path; static struct string_list config_fetch_recurse_submodules_for_name; static struct string_list config_ignore_for_name; static int config_fetch_recurse_submodules = RECURSE_SUBMODULES_ON_DEMAND; static struct string_list changed_submodule_paths; +static int initialized_fetch_ref_tips; +static struct sha1_array ref_tips_before_fetch; +static struct sha1_array ref_tips_after_fetch; + /* * The following flag is set if the .gitmodules file is unmerged. We then * disable recursion for all submodules where .git/config doesn't have a @@ -366,16 +371,72 @@ static void submodule_collect_changed_cb(struct diff_queue_struct *q, } } +static int add_sha1_to_array(const char *ref, const unsigned char *sha1, + int flags, void *data) +{ + sha1_array_append(data, sha1); + return 0; +} + void check_for_new_submodule_commits(unsigned char new_sha1[20]) { + if (!initialized_fetch_ref_tips) { + for_each_ref(add_sha1_to_array, &ref_tips_before_fetch); + initialized_fetch_ref_tips = 1; + } + + sha1_array_append(&ref_tips_after_fetch, new_sha1); +} + +struct argv_array { + const char **argv; + unsigned int argc; + unsigned int alloc; +}; + +static void init_argv(struct argv_array *array) +{ + array->argv = NULL; + array->argc = 0; + array->alloc = 0; +} + +static void push_argv(struct argv_array *array, const char *value) +{ + ALLOC_GROW(array->argv, array->argc + 2, array->alloc); + array->argv[array->argc++] = xstrdup(value); + array->argv[array->argc] = NULL; +} + +static void clear_argv(struct argv_array *array) +{ + int i; + for (i = 0; i < array->argc; i++) + free((char **)array->argv[i]); + free(array->argv); + init_argv(array); +} + +static void add_sha1_to_argv(const unsigned char sha1[20], void *data) +{ + 
push_argv(data, sha1_to_hex(sha1)); +} + +static void calculate_changed_submodule_paths(void) +{ struct rev_info rev; struct commit *commit; - const char *argv[] = {NULL, NULL, "--not", "--all", NULL}; - int argc = ARRAY_SIZE(argv) - 1; + struct argv_array argv; init_revisions(&rev, NULL); - argv[1] = xstrdup(sha1_to_hex(new_sha1)); - setup_revisions(argc, argv, &rev, NULL); + init_argv(&argv); + push_argv(&argv, "--"); /* argv[0] program name */ + sha1_array_for_each_unique(&ref_tips_after_fetch, + add_sha1_to_argv, &argv); + push_argv(&argv, "--not"); + sha1_array_for_each_unique(&ref_tips_before_fetch, + add_sha1_to_argv, &argv); + setup_revisions(argv.argc, argv.argv, &rev, NULL); if (prepare_revision_walk(&rev)) die("revision walk setup failed"); @@ -398,7 +459,11 @@ void check_for_new_submodule_commits(unsigned char new_sha1[20]) parent = parent->next; } } - free((char *)argv[1]); + + clear_argv(&argv); + sha1_array_clear(&ref_tips_before_fetch); + sha1_array_clear(&ref_tips_after_fetch); + initialized_fetch_ref_tips = 0; } int fetch_populated_submodules(int num_options, const char **options, @@ -432,6 +497,8 @@ int fetch_populated_submodules(int num_options, const char **options, cp.git_cmd = 1; cp.no_stdin = 1; + calculate_changed_submodule_paths(); + for (i = 0; i < active_nr; i++) { struct strbuf submodule_path = STRBUF_INIT; struct strbuf submodule_git_dir = STRBUF_INIT; |