aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2016-08-08 14:48:39 -0700
committerJunio C Hamano <gitster@pobox.com>2016-08-08 14:48:39 -0700
commit78849622ec58ae576f36b155824e813543a9bdf5 (patch)
tree5b1c92c1beb5bfdd6072c31e8ba85d541ac2146a
parent7647537e3e8f263306e22cb22b4f4f7b09589b19 (diff)
parent56dfeb62638760fa78a442a97f19abf1af374d29 (diff)
downloadgit-78849622ec58ae576f36b155824e813543a9bdf5.tar.gz
git-78849622ec58ae576f36b155824e813543a9bdf5.tar.xz
Merge branch 'jk/pack-objects-optim'
"git pack-objects" has a few options that tell it not to pack objects found in certain packfiles, which require it to scan .idx files of all available packs. The codepaths involved in these operations have been optimized for a common case of not having any non-local pack and/or any .kept pack. * jk/pack-objects-optim: pack-objects: compute local/ignore_pack_keep early pack-objects: break out of want_object loop early find_pack_entry: replace last_found_pack with MRU cache add generic most-recently-used list sha1_file: drop free_pack_by_name t/perf: add tests for many-pack scenarios
-rw-r--r--Makefile1
-rw-r--r--builtin/pack-objects.c40
-rw-r--r--cache.h8
-rw-r--r--mru.c50
-rw-r--r--mru.h45
-rw-r--r--pack-write.c1
-rw-r--r--sha1_file.c66
-rwxr-xr-xt/perf/p5303-many-packs.sh87
8 files changed, 248 insertions, 50 deletions
diff --git a/Makefile b/Makefile
index 6a13386c2..ad3624d95 100644
--- a/Makefile
+++ b/Makefile
@@ -755,6 +755,7 @@ LIB_OBJS += merge.o
LIB_OBJS += merge-blobs.o
LIB_OBJS += merge-recursive.o
LIB_OBJS += mergesort.o
+LIB_OBJS += mru.o
LIB_OBJS += name-hash.o
LIB_OBJS += notes.o
LIB_OBJS += notes-cache.o
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 92e2e5f7a..4a6339896 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -46,6 +46,7 @@ static int keep_unreachable, unpack_unreachable, include_tag;
static unsigned long unpack_unreachable_expiration;
static int pack_loose_unreachable;
static int local;
+static int have_non_local_packs;
static int incremental;
static int ignore_packed_keep;
static int allow_ofs_delta;
@@ -978,6 +979,23 @@ static int want_object_in_pack(const unsigned char *sha1,
return 1;
if (incremental)
return 0;
+
+ /*
+ * When asked to do --local (do not include an
+ * object that appears in a pack we borrow
+ * from elsewhere) or --honor-pack-keep (do not
+ * include an object that appears in a pack marked
+ * with .keep), we need to make sure no copy of this
+ * object come from in _any_ pack that causes us to
+ * omit it, and need to complete this loop. When
+ * neither option is in effect, we know the object
+ * we just found is going to be packed, so break
+ * out of the loop to return 1 now.
+ */
+ if (!ignore_packed_keep &&
+ (!local || !have_non_local_packs))
+ break;
+
if (local && !p->pack_local)
return 0;
if (ignore_packed_keep && p->pack_local && p->pack_keep)
@@ -2784,6 +2802,28 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
progress = 2;
prepare_packed_git();
+ if (ignore_packed_keep) {
+ struct packed_git *p;
+ for (p = packed_git; p; p = p->next)
+ if (p->pack_local && p->pack_keep)
+ break;
+ if (!p) /* no keep-able packs found */
+ ignore_packed_keep = 0;
+ }
+ if (local) {
+ /*
+ * unlike ignore_packed_keep above, we do not want to
+ * unset "local" based on looking at packs, as it
+ * also covers non-local objects
+ */
+ struct packed_git *p;
+ for (p = packed_git; p; p = p->next) {
+ if (!p->pack_local) {
+ have_non_local_packs = 1;
+ break;
+ }
+ }
+ }
if (progress)
progress_state = start_progress(_("Counting objects"), 0);
diff --git a/cache.h b/cache.h
index e8128fc5d..7c8051be0 100644
--- a/cache.h
+++ b/cache.h
@@ -1378,6 +1378,13 @@ extern struct packed_git {
char pack_name[FLEX_ARRAY]; /* more */
} *packed_git;
+/*
+ * A most-recently-used ordered version of the packed_git list, which can
+ * be iterated instead of packed_git (and marked via mru_mark).
+ */
+struct mru;
+extern struct mru *packed_git_mru;
+
struct pack_entry {
off_t offset;
unsigned char sha1[20];
@@ -1417,7 +1424,6 @@ extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t
extern void close_pack_windows(struct packed_git *);
extern void close_all_packs(void);
extern void unuse_pack(struct pack_window **);
-extern void free_pack_by_name(const char *);
extern void clear_delta_base_cache(void);
extern struct packed_git *add_packed_git(const char *path, size_t path_len, int local);
diff --git a/mru.c b/mru.c
new file mode 100644
index 000000000..9dedae028
--- /dev/null
+++ b/mru.c
@@ -0,0 +1,50 @@
+#include "cache.h"
+#include "mru.h"
+
+void mru_append(struct mru *mru, void *item)
+{
+ struct mru_entry *cur = xmalloc(sizeof(*cur));
+ cur->item = item;
+ cur->prev = mru->tail;
+ cur->next = NULL;
+
+ if (mru->tail)
+ mru->tail->next = cur;
+ else
+ mru->head = cur;
+ mru->tail = cur;
+}
+
+void mru_mark(struct mru *mru, struct mru_entry *entry)
+{
+ /* If we're already at the front of the list, nothing to do */
+ if (mru->head == entry)
+ return;
+
+ /* Otherwise, remove us from our current slot... */
+ if (entry->prev)
+ entry->prev->next = entry->next;
+ if (entry->next)
+ entry->next->prev = entry->prev;
+ else
+ mru->tail = entry->prev;
+
+ /* And insert us at the beginning. */
+ entry->prev = NULL;
+ entry->next = mru->head;
+ if (mru->head)
+ mru->head->prev = entry;
+ mru->head = entry;
+}
+
+void mru_clear(struct mru *mru)
+{
+ struct mru_entry *p = mru->head;
+
+ while (p) {
+ struct mru_entry *to_free = p;
+ p = p->next;
+ free(to_free);
+ }
+ mru->head = mru->tail = NULL;
+}
diff --git a/mru.h b/mru.h
new file mode 100644
index 000000000..42e4aeaa1
--- /dev/null
+++ b/mru.h
@@ -0,0 +1,45 @@
+#ifndef MRU_H
+#define MRU_H
+
+/**
+ * A simple most-recently-used cache, backed by a doubly-linked list.
+ *
+ * Usage is roughly:
+ *
+ * // Create a list. Zero-initialization is required.
+ * static struct mru cache;
+ * mru_append(&cache, item);
+ * ...
+ *
+ * // Iterate in MRU order.
+ * struct mru_entry *p;
+ * for (p = cache.head; p; p = p->next) {
+ * if (matches(p->item))
+ * break;
+ * }
+ *
+ * // Mark an item as used, moving it to the front of the list.
+ * mru_mark(&cache, p);
+ *
+ * // Reset the list to empty, cleaning up all resources.
+ * mru_clear(&cache);
+ *
+ * Note that you SHOULD NOT call mru_mark() and then continue traversing the
+ * list; it reorders the marked item to the front of the list, and therefore
+ * you will begin traversing the whole list again.
+ */
+
+struct mru_entry {
+ void *item;
+ struct mru_entry *prev, *next;
+};
+
+struct mru {
+ struct mru_entry *head, *tail;
+};
+
+void mru_append(struct mru *mru, void *item);
+void mru_mark(struct mru *mru, struct mru_entry *entry);
+void mru_clear(struct mru *mru);
+
+#endif /* MRU_H */
diff --git a/pack-write.c b/pack-write.c
index 33293ce2a..ea0b78813 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -354,7 +354,6 @@ void finish_tmp_packfile(struct strbuf *name_buffer,
die_errno("unable to make temporary index file readable");
strbuf_addf(name_buffer, "%s.pack", sha1_to_hex(sha1));
- free_pack_by_name(name_buffer->buf);
if (rename(pack_tmp_name, name_buffer->buf))
die_errno("unable to rename temporary pack file");
diff --git a/sha1_file.c b/sha1_file.c
index cb571ac6e..3066b5f71 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -23,6 +23,7 @@
#include "bulk-checkin.h"
#include "streaming.h"
#include "dir.h"
+#include "mru.h"
#ifndef O_NOATIME
#if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -59,14 +60,6 @@ static struct cached_object empty_tree = {
0
};
-/*
- * A pointer to the last packed_git in which an object was found.
- * When an object is sought, we look in this packfile first, because
- * objects that are looked up at similar times are often in the same
- * packfile as one another.
- */
-static struct packed_git *last_found_pack;
-
static struct cached_object *find_cached_object(const unsigned char *sha1)
{
int i;
@@ -522,6 +515,9 @@ static size_t peak_pack_mapped;
static size_t pack_mapped;
struct packed_git *packed_git;
+static struct mru packed_git_mru_storage;
+struct mru *packed_git_mru = &packed_git_mru_storage;
+
void pack_report(void)
{
fprintf(stderr,
@@ -891,36 +887,6 @@ void close_pack_index(struct packed_git *p)
}
}
-/*
- * This is used by git-repack in case a newly created pack happens to
- * contain the same set of objects as an existing one. In that case
- * the resulting file might be different even if its name would be the
- * same. It is best to close any reference to the old pack before it is
- * replaced on disk. Of course no index pointers or windows for given pack
- * must subsist at this point. If ever objects from this pack are requested
- * again, the new version of the pack will be reinitialized through
- * reprepare_packed_git().
- */
-void free_pack_by_name(const char *pack_name)
-{
- struct packed_git *p, **pp = &packed_git;
-
- while (*pp) {
- p = *pp;
- if (strcmp(pack_name, p->pack_name) == 0) {
- clear_delta_base_cache();
- close_pack(p);
- free(p->bad_object_sha1);
- *pp = p->next;
- if (last_found_pack == p)
- last_found_pack = NULL;
- free(p);
- return;
- }
- pp = &p->next;
- }
-}
-
static unsigned int get_max_fd_limit(void)
{
#ifdef RLIMIT_NOFILE
@@ -1385,6 +1351,15 @@ static void rearrange_packed_git(void)
free(ary);
}
+static void prepare_packed_git_mru(void)
+{
+ struct packed_git *p;
+
+ mru_clear(packed_git_mru);
+ for (p = packed_git; p; p = p->next)
+ mru_append(packed_git_mru, p);
+}
+
static int prepare_packed_git_run_once = 0;
void prepare_packed_git(void)
{
@@ -1400,6 +1375,7 @@ void prepare_packed_git(void)
alt->name[-1] = '/';
}
rearrange_packed_git();
+ prepare_packed_git_mru();
prepare_packed_git_run_once = 1;
}
@@ -2604,21 +2580,15 @@ static int fill_pack_entry(const unsigned char *sha1,
*/
static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e)
{
- struct packed_git *p;
+ struct mru_entry *p;
prepare_packed_git();
if (!packed_git)
return 0;
- if (last_found_pack && fill_pack_entry(sha1, e, last_found_pack))
- return 1;
-
- for (p = packed_git; p; p = p->next) {
- if (p == last_found_pack)
- continue; /* we already checked this one */
-
- if (fill_pack_entry(sha1, e, p)) {
- last_found_pack = p;
+ for (p = packed_git_mru->head; p; p = p->next) {
+ if (fill_pack_entry(sha1, e, p->item)) {
+ mru_mark(packed_git_mru, p);
return 1;
}
}
diff --git a/t/perf/p5303-many-packs.sh b/t/perf/p5303-many-packs.sh
new file mode 100755
index 000000000..377985194
--- /dev/null
+++ b/t/perf/p5303-many-packs.sh
@@ -0,0 +1,87 @@
+#!/bin/sh
+
+test_description='performance with large numbers of packs'
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+# A real many-pack situation would probably come from having a lot of pushes
+# over time. We don't know how big each push would be, but we can fake it by
+# just walking the first-parent chain and having every 5 commits be their own
+# "push". This isn't _entirely_ accurate, as real pushes would have some
+# duplicate objects due to thin-pack fixing, but it's a reasonable
+# approximation.
+#
+# And then all of the rest of the objects can go in a single packfile that
+# represents the state before any of those pushes (actually, we'll generate
+# that first because in such a setup it would be the oldest pack, and we sort
+# the packs by reverse mtime inside git).
+repack_into_n () {
+ rm -rf staging &&
+ mkdir staging &&
+
+ git rev-list --first-parent HEAD |
+ sed -n '1~5p' |
+ head -n "$1" |
+ perl -e 'print reverse <>' \
+ >pushes
+
+ # create base packfile
+ head -n 1 pushes |
+ git pack-objects --delta-base-offset --revs staging/pack
+
+ # and then incrementals between each pair of commits
+ last= &&
+ while read rev
+ do
+ if test -n "$last"; then
+ {
+ echo "$rev" &&
+ echo "^$last"
+ } |
+ git pack-objects --delta-base-offset --revs \
+ staging/pack || return 1
+ fi
+ last=$rev
+ done <pushes &&
+
+ # and install the whole thing
+ rm -f .git/objects/pack/* &&
+ mv staging/* .git/objects/pack/
+}
+
+# Pretend we just have a single branch and no reflogs, and that everything is
+# in objects/pack; that makes our fake pack-building via repack_into_n()
+# much simpler.
+test_expect_success 'simplify reachability' '
+ tip=$(git rev-parse --verify HEAD) &&
+ git for-each-ref --format="option no-deref%0adelete %(refname)" |
+ git update-ref --stdin &&
+ rm -rf .git/logs &&
+ git update-ref refs/heads/master $tip &&
+ git symbolic-ref HEAD refs/heads/master &&
+ git repack -ad
+'
+
+for nr_packs in 1 50 1000
+do
+ test_expect_success "create $nr_packs-pack scenario" '
+ repack_into_n $nr_packs
+ '
+
+ test_perf "rev-list ($nr_packs)" '
+ git rev-list --objects --all >/dev/null
+ '
+
+ # This simulates the interesting part of the repack, which is the
+ # actual pack generation, without smudging the on-disk setup
+ # between trials.
+ test_perf "repack ($nr_packs)" '
+ git pack-objects --keep-true-parents \
+ --honor-pack-keep --non-empty --all \
+ --reflog --indexed-objects --delta-base-offset \
+ --stdout </dev/null >/dev/null
+ '
+done
+
+test_done