From 3ffefb54c0515308ceafb6ba071567d9fd379498 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:36:52 -0400 Subject: commit_tree: take a pointer/len pair rather than a const strbuf While strbufs are pretty common throughout our code, it is more flexible for functions to take a pointer/len pair than a strbuf. It's easy to turn a strbuf into such a pair (by dereferencing its members), but less easy to go the other way (you can strbuf_attach, but that has implications about memory ownership). This patch teaches commit_tree (and its associated callers and sub-functions) to take such a pair for the commit message rather than a strbuf. This makes passing the buffer around slightly more verbose, but means we can get rid of some dangerous strbuf_attach calls in the next patch. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index f4793316a..bd3d5afcd 100644 --- a/commit.c +++ b/commit.c @@ -1344,7 +1344,8 @@ void free_commit_extra_headers(struct commit_extra_header *extra) } } -int commit_tree(const struct strbuf *msg, const unsigned char *tree, +int commit_tree(const char *msg, size_t msg_len, + const unsigned char *tree, struct commit_list *parents, unsigned char *ret, const char *author, const char *sign_commit) { @@ -1352,7 +1353,7 @@ int commit_tree(const struct strbuf *msg, const unsigned char *tree, int result; append_merge_tag_headers(parents, &tail); - result = commit_tree_extended(msg, tree, parents, ret, + result = commit_tree_extended(msg, msg_len, tree, parents, ret, author, sign_commit, extra); free_commit_extra_headers(extra); return result; @@ -1473,7 +1474,8 @@ static const char commit_utf8_warn[] = "You may want to amend it after fixing the message, or set the config\n" "variable i18n.commitencoding to the encoding your project uses.\n"; -int commit_tree_extended(const struct strbuf *msg, const unsigned char *tree, +int commit_tree_extended(const char *msg, size_t msg_len, + const unsigned char *tree, struct commit_list *parents, unsigned char *ret, const char *author, const char *sign_commit, struct commit_extra_header *extra) @@ -1484,7 +1486,7 @@ int commit_tree_extended(const struct strbuf *msg, const unsigned char *tree, assert_sha1_type(tree, OBJ_TREE); - if (memchr(msg->buf, '\0', msg->len)) + if (memchr(msg, '\0', msg_len)) return error("a NUL byte in commit log message not allowed."); /* Not having i18n.commitencoding is the same as having utf-8 */ @@ -1523,7 +1525,7 @@ int commit_tree_extended(const struct strbuf *msg, const unsigned char *tree, strbuf_addch(&buffer, '\n'); /* And add the comment */ - strbuf_addbuf(&buffer, msg); + strbuf_add(&buffer, msg, msg_len); /* And check the encoding */ if (encoding_is_utf8 && !verify_utf8(&buffer)) -- cgit v1.2.1 From 969eba6341a5af8ac52c67e26462548ed05e23e3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:39:04 -0400 Subject: commit: push commit_index update into alloc_commit_node Whenever we create a commit object via lookup_commit, we give it a unique index to be used with the commit-slab API. The theory is that any "struct commit" we create would follow this code path, so any such struct would get an index. However, callers could use alloc_commit_node() directly (and get multiple commits with index 0). Let's push the indexing into alloc_commit_node so that it's hard for callers to get it wrong. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index bd3d5afcd..fbdc480cc 100644 --- a/commit.c +++ b/commit.c @@ -17,7 +17,6 @@ static struct commit_extra_header *read_commit_extra_header_lines(const char *bu int save_commit_buffer = 1; const char *commit_type = "commit"; -static int commit_count; static struct commit *check_commit(struct object *obj, const unsigned char *sha1, @@ -64,7 +63,6 @@ struct commit *lookup_commit(const unsigned char *sha1) struct object *obj = lookup_object(sha1); if (!obj) { struct commit *c = alloc_commit_node(); - c->index = commit_count++; return create_object(sha1, OBJ_COMMIT, c); } if (!obj->type) -- cgit v1.2.1 From 0fb370da9ca972f9571530f95c0dacb31368c280 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 12 Jun 2014 18:05:37 -0400 Subject: provide a helper to free commit buffer This converts two lines into one at each caller. But more importantly, it abstracts the concept of freeing the buffer, which will make it easier to change later. Note that we also need to provide a "detach" mechanism for a tricky case in index-pack. We are passed a buffer for the object generated by processing the incoming pack. If we are not using --strict, we just calculate the sha1 on that buffer and return, leaving the caller to free it. But if we are using --strict, we actually attach that buffer to an object, pass the object to the fsck functions, and then detach the buffer from the object again (so that the caller can free it as usual). In this case, we don't want to free the buffer ourselves, but just make sure it is no longer associated with the commit. Note that we are making the assumption here that the attach/detach process does not impact the buffer at all (e.g., it is never reallocated or modified). That holds true now, and we have no plans to change that. However, as we abstract the commit_buffer code, this dependency becomes less obvious. So when we detach, let's also make sure that we get back the same buffer that we gave to the commit_buffer code. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'commit.c') diff --git a/commit.c b/commit.c index fbdc480cc..11a05c1f2 100644 --- a/commit.c +++ b/commit.c @@ -245,6 +245,19 @@ int unregister_shallow(const unsigned char *sha1) return 0; } +void free_commit_buffer(struct commit *commit) +{ + free(commit->buffer); + commit->buffer = NULL; +} + +const void *detach_commit_buffer(struct commit *commit) +{ + void *ret = commit->buffer; + commit->buffer = NULL; + return ret; +} + int parse_commit_buffer(struct commit *item, const void *buffer, unsigned long size) { const char *tail = buffer; -- cgit v1.2.1 From 66c2827ea4deb24ff541e30a5b6239ad5e9f6801 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:40:14 -0400 Subject: provide a helper to set the commit buffer Right now this is just a one-liner, but abstracting it will make it easier to change later. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index 11a05c1f2..fc8b4e287 100644 --- a/commit.c +++ b/commit.c @@ -245,6 +245,11 @@ int unregister_shallow(const unsigned char *sha1) return 0; } +void set_commit_buffer(struct commit *commit, void *buffer) +{ + commit->buffer = buffer; +} + void free_commit_buffer(struct commit *commit) { free(commit->buffer); @@ -335,7 +340,7 @@ int parse_commit(struct commit *item) } ret = parse_commit_buffer(item, buffer, size); if (save_commit_buffer && !ret) { - item->buffer = buffer; + set_commit_buffer(item, buffer); return 0; } free(buffer); -- cgit v1.2.1 From 152ff1ccebd822fd97f27d2a6c3fa2058f088fd8 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:40:39 -0400 Subject: provide helpers to access the commit buffer Many sites look at commit->buffer to get more detailed information than what is in the parsed commit struct. However, we sometimes drop commit->buffer to save memory, in which case the caller would need to read the object afresh. Some callers do this (leading to duplicated code), and others do not (which opens the possibility of a segfault if somebody else frees the buffer). Let's provide a pair of helpers, "get" and "unuse", that let callers easily get the buffer. They will use the cached buffer when possible, and otherwise load from disk using read_sha1_file. Note that we also need to add a "get_cached" variant which returns NULL when we do not have a cached buffer. At first glance this seems to defeat the purpose of "get", which is to always provide a return value. However, some log code paths actually use the NULL-ness of commit->buffer as a boolean flag to decide whether to try printing the commit. At least for now, we want to continue supporting that use. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'commit.c') diff --git a/commit.c b/commit.c index fc8b4e287..b6b0e0d6f 100644 --- a/commit.c +++ b/commit.c @@ -250,6 +250,34 @@ void set_commit_buffer(struct commit *commit, void *buffer) commit->buffer = buffer; } +const void *get_cached_commit_buffer(const struct commit *commit) +{ + return commit->buffer; +} + +const void *get_commit_buffer(const struct commit *commit) +{ + const void *ret = get_cached_commit_buffer(commit); + if (!ret) { + enum object_type type; + unsigned long size; + ret = read_sha1_file(commit->object.sha1, &type, &size); + if (!ret) + die("cannot read commit object %s", + sha1_to_hex(commit->object.sha1)); + if (type != OBJ_COMMIT) + die("expected commit for %s, got %s", + sha1_to_hex(commit->object.sha1), typename(type)); + } + return ret; +} + +void unuse_commit_buffer(const struct commit *commit, const void *buffer) +{ + if (commit->buffer != buffer) + free((void *)buffer); +} + void free_commit_buffer(struct commit *commit) { free(commit->buffer); -- cgit v1.2.1 From ba41c1c93fd9109eae954f75a8cb8e32c3e29530 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:41:02 -0400 Subject: use get_commit_buffer to avoid duplicate code For both of these sites, we already do the "fallback to read_sha1_file" trick. But we can shorten the code by just using get_commit_buffer. Note that the error cases are slightly different when read_sha1_file fails. get_commit_buffer will die() if the object cannot be loaded, or is a non-commit. For get_sha1_oneline, this will almost certainly never happen, as we will have just called parse_object (and if it does, it's probably worth complaining about). For record_author_date, the new behavior is probably better; we notify the user of the error instead of silently ignoring it. And because it's used only for sorting by author-date, somebody examining a corrupt repo can fallback to the regular traversal order. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index b6b0e0d6f..1903dde28 100644 --- a/commit.c +++ b/commit.c @@ -583,22 +583,12 @@ static void record_author_date(struct author_date_slab *author_date, struct commit *commit) { const char *buf, *line_end, *ident_line; - char *buffer = NULL; + const char *buffer = get_commit_buffer(commit); struct ident_split ident; char *date_end; unsigned long date; - if (!commit->buffer) { - unsigned long size; - enum object_type type; - buffer = read_sha1_file(commit->object.sha1, &type, &size); - if (!buffer) - return; - } - - for (buf = commit->buffer ? commit->buffer : buffer; - buf; - buf = line_end + 1) { + for (buf = buffer; buf; buf = line_end + 1) { line_end = strchrnul(buf, '\n'); ident_line = skip_prefix(buf, "author "); if (!ident_line) { @@ -619,7 +609,7 @@ static void record_author_date(struct author_date_slab *author_date, *(author_date_slab_at(author_date, commit)) = date; fail_exit: - free(buffer); + unuse_commit_buffer(commit, buffer); } static int compare_commits_by_author_date(const void *a_, const void *b_, -- cgit v1.2.1 From c1b3c71f4b4571abb2b2a457122fd100dc9f7eb0 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:43:02 -0400 Subject: commit: convert commit->buffer to a slab This will make it easier to manage the buffer cache independently of the "struct commit" objects. It also shrinks "struct commit" by one pointer, which may be helpful. Unfortunately it does not reduce the max memory size of something like "rev-list", because rev-list uses get_cached_commit_buffer() to decide not to show each commit's output (and due to the design of slab_at, accessing the slab requires us to extend it, allocating exactly the same number of buffer pointers we dropped from the commit structs). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index 1903dde28..e289c7832 100644 --- a/commit.c +++ b/commit.c @@ -245,14 +245,17 @@ int unregister_shallow(const unsigned char *sha1) return 0; } +define_commit_slab(buffer_slab, void *); +static struct buffer_slab buffer_slab = COMMIT_SLAB_INIT(1, buffer_slab); + void set_commit_buffer(struct commit *commit, void *buffer) { - commit->buffer = buffer; + *buffer_slab_at(&buffer_slab, commit) = buffer; } const void *get_cached_commit_buffer(const struct commit *commit) { - return commit->buffer; + return *buffer_slab_at(&buffer_slab, commit); } const void *get_commit_buffer(const struct commit *commit) @@ -274,20 +277,23 @@ const void *get_commit_buffer(const struct commit *commit) void unuse_commit_buffer(const struct commit *commit, const void *buffer) { - if (commit->buffer != buffer) + void *cached = *buffer_slab_at(&buffer_slab, commit); + if (cached != buffer) free((void *)buffer); } void free_commit_buffer(struct commit *commit) { - free(commit->buffer); - commit->buffer = NULL; + void **b = buffer_slab_at(&buffer_slab, commit); + free(*b); + *b = NULL; } const void *detach_commit_buffer(struct commit *commit) { - void *ret = commit->buffer; - commit->buffer = NULL; + void **b = buffer_slab_at(&buffer_slab, commit); + void *ret = *b; + *b = NULL; return ret; } -- cgit v1.2.1 From 8597ea3afea067b39ba7d4adae7ec6c1ee0e7c91 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 10 Jun 2014 17:44:13 -0400 Subject: commit: record buffer length in cache Most callsites which use the commit buffer try to use the cached version attached to the commit, rather than re-reading from disk. Unfortunately, that interface provides only a pointer to the NUL-terminated buffer, with no indication of the original length. For the most part, this doesn't matter. People do not put NULs in their commit messages, and the log code is happy to treat it all as a NUL-terminated string. However, some code paths do care. For example, when checking signatures, we want to be very careful that we verify all the bytes to avoid malicious trickery. This patch just adds an optional "size" out-pointer to get_commit_buffer and friends. The existing callers all pass NULL (there did not seem to be any obvious sites where we could avoid an immediate strlen() call, though perhaps with some further refactoring we could). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 54 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 18 deletions(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index e289c7832..a036e181c 100644 --- a/commit.c +++ b/commit.c @@ -245,22 +245,31 @@ int unregister_shallow(const unsigned char *sha1) return 0; } -define_commit_slab(buffer_slab, void *); +struct commit_buffer { + void *buffer; + unsigned long size; +}; +define_commit_slab(buffer_slab, struct commit_buffer); static struct buffer_slab buffer_slab = COMMIT_SLAB_INIT(1, buffer_slab); -void set_commit_buffer(struct commit *commit, void *buffer) +void set_commit_buffer(struct commit *commit, void *buffer, unsigned long size) { - *buffer_slab_at(&buffer_slab, commit) = buffer; + struct commit_buffer *v = buffer_slab_at(&buffer_slab, commit); + v->buffer = buffer; + v->size = size; } -const void *get_cached_commit_buffer(const struct commit *commit) +const void *get_cached_commit_buffer(const struct commit *commit, unsigned long *sizep) { - return *buffer_slab_at(&buffer_slab, commit); + struct commit_buffer *v = buffer_slab_at(&buffer_slab, commit); + if (sizep) + *sizep = v->size; + return v->buffer; } -const void *get_commit_buffer(const struct commit *commit) +const void *get_commit_buffer(const struct commit *commit, unsigned long *sizep) { - const void *ret = get_cached_commit_buffer(commit); + const void *ret = get_cached_commit_buffer(commit, sizep); if (!ret) { enum object_type type; unsigned long size; @@ -271,29 +280,38 @@ const void *get_commit_buffer(const struct commit *commit) if (type != OBJ_COMMIT) die("expected commit for %s, got %s", sha1_to_hex(commit->object.sha1), typename(type)); + if (sizep) + *sizep = size; } return ret; } void unuse_commit_buffer(const struct commit *commit, const void *buffer) { - void *cached = *buffer_slab_at(&buffer_slab, commit); - if (cached != buffer) + struct commit_buffer *v = buffer_slab_at(&buffer_slab, commit); + if (v->buffer != buffer) free((void *)buffer); } void free_commit_buffer(struct commit *commit) { - void **b = buffer_slab_at(&buffer_slab, commit); - free(*b); - *b = NULL; + struct commit_buffer *v = buffer_slab_at(&buffer_slab, commit); + free(v->buffer); + v->buffer = NULL; + v->size = 0; } -const void *detach_commit_buffer(struct commit *commit) +const void *detach_commit_buffer(struct commit *commit, unsigned long *sizep) { - void **b = buffer_slab_at(&buffer_slab, commit); - void *ret = *b; - *b = NULL; + struct commit_buffer *v = buffer_slab_at(&buffer_slab, commit); + void *ret; + + ret = v->buffer; + if (sizep) + *sizep = v->size; + + v->buffer = NULL; + v->size = 0; return ret; } @@ -374,7 +392,7 @@ int parse_commit(struct commit *item) } ret = parse_commit_buffer(item, buffer, size); if (save_commit_buffer && !ret) { - set_commit_buffer(item, buffer); + set_commit_buffer(item, buffer, size); return 0; } free(buffer); @@ -589,7 +607,7 @@ static void record_author_date(struct author_date_slab *author_date, struct commit *commit) { const char *buf, *line_end, *ident_line; - const char *buffer = get_commit_buffer(commit); + const char *buffer = get_commit_buffer(commit, NULL); struct ident_split ident; char *date_end; unsigned long date; -- cgit v1.2.1 From 218aa3a6162b80696a82b8745daa38fa826985ae Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jun 2014 02:32:11 -0400 Subject: reuse cached commit buffer when parsing signatures When we call show_signature or show_mergetag, we read the commit object fresh via read_sha1_file and reparse its headers. However, in most cases we already have the object data available, attached to the "struct commit". This is partially laziness in dealing with the memory allocation issues, but partially defensive programming, in that we would always want to verify a clean version of the buffer (not one that might have been munged by other users of the commit). However, we do not currently ever munge the commit buffer, and not using the already-available buffer carries a fairly big performance penalty when we are looking at a large number of commits. Here are timings on linux.git: [baseline, no signatures] $ time git log >/dev/null real 0m4.902s user 0m4.784s sys 0m0.120s [before] $ time git log --show-signature >/dev/null real 0m14.735s user 0m9.964s sys 0m0.944s [after] $ time git log --show-signature >/dev/null real 0m9.981s user 0m5.260s sys 0m0.936s Note that our user CPU time drops almost in half, close to the non-signature case, but we do still spend more wall-clock and system time, presumably from dealing with gpg. An alternative to this is to note that most commits do not have signatures (less than 1% in this repo), yet we pay the re-parsing cost for every commit just to find out if it has a mergetag or signature. If we checked that when parsing the commit initially, we could avoid re-examining most commits later on. Even if we did pursue that direction, however, this would still speed up the cases where we _do_ have signatures. So it's probably worth doing either way. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'commit.c') diff --git a/commit.c b/commit.c index a036e181c..ebd7ad846 100644 --- a/commit.c +++ b/commit.c @@ -1138,17 +1138,14 @@ static int do_sign_commit(struct strbuf *buf, const char *keyid) return 0; } -int parse_signed_commit(const unsigned char *sha1, +int parse_signed_commit(const struct commit *commit, struct strbuf *payload, struct strbuf *signature) { + unsigned long size; - enum object_type type; - char *buffer = read_sha1_file(sha1, &type, &size); + const char *buffer = get_commit_buffer(commit, &size); int in_signature, saw_signature = -1; - char *line, *tail; - - if (!buffer || type != OBJ_COMMIT) - goto cleanup; + const char *line, *tail; line = buffer; tail = buffer + size; @@ -1156,7 +1153,7 @@ int parse_signed_commit(const unsigned char *sha1, saw_signature = 0; while (line < tail) { const char *sig = NULL; - char *next = memchr(line, '\n', tail - line); + const char *next = memchr(line, '\n', tail - line); next = next ? next + 1 : tail; if (in_signature && line[0] == ' ') @@ -1177,8 +1174,7 @@ int parse_signed_commit(const unsigned char *sha1, } line = next; } - cleanup: - free(buffer); + unuse_commit_buffer(commit, buffer); return saw_signature; } @@ -1269,8 +1265,7 @@ void check_commit_signature(const struct commit* commit, struct signature_check sigc->result = 'N'; - if (parse_signed_commit(commit->object.sha1, - &payload, &signature) <= 0) + if (parse_signed_commit(commit, &payload, &signature) <= 0) goto out; status = verify_signed_buffer(payload.buf, payload.len, signature.buf, signature.len, @@ -1315,11 +1310,9 @@ struct commit_extra_header *read_commit_extra_headers(struct commit *commit, { struct commit_extra_header *extra = NULL; unsigned long size; - enum object_type type; - char *buffer = read_sha1_file(commit->object.sha1, &type, &size); - if (buffer && type == OBJ_COMMIT) - extra = read_commit_extra_header_lines(buffer, size, exclude); - free(buffer); + const char *buffer = get_commit_buffer(commit, &size); + extra = read_commit_extra_header_lines(buffer, size, exclude); + unuse_commit_buffer(commit, buffer); return extra; } -- cgit v1.2.1