diff options
author | Vicent Marti <tanoku@gmail.com> | 2013-11-14 07:43:51 -0500 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2013-12-30 12:17:20 -0800 |
commit | e1273106f62927e3efdb1cfa107cb1a9f913274c (patch) | |
tree | 9a23af0dbec1791e1a0d8b3137e614744ee14f19 /ewah/ewah_io.c | |
parent | 7e3dae494370b5596a6ea76af1191829ce11bce2 (diff) | |
download | git-e1273106f62927e3efdb1cfa107cb1a9f913274c.tar.gz git-e1273106f62927e3efdb1cfa107cb1a9f913274c.tar.xz |
ewah: compressed bitmap implementation
EWAH is a word-aligned compressed variant of a bitset (i.e. a data
structure that acts as a 0-indexed boolean array for many entries).
It uses a 64-bit run-length encoding (RLE) compression scheme,
trading some compression for better processing speed.
The goal of this word-aligned implementation is not to achieve
the best compression, but rather to improve query processing time.
As it stands right now, this EWAH implementation will always be more
efficient storage-wise than its uncompressed alternative.
EWAH arrays will be used as the on-disk format to store reachability
bitmaps for all objects in a repository while keeping reasonable sizes,
in the same way that JGit does.
This EWAH implementation is a mostly straightforward port of the
original `javaewah` library that JGit currently uses. The library is
self-contained and has been embedded whole (4 files) inside the `ewah`
folder to ease redistribution.
The library is re-licensed under the GPLv2 with the permission of Daniel
Lemire, the original author. The source code for the C version can
be found on GitHub:
https://github.com/vmg/libewok
The original Java implementation can also be found on GitHub:
https://github.com/lemire/javaewah
[jc: stripped debug-only code per Peff's $gmane/239768]
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Helped-by: Ramsay Jones <ramsay@ramsay1.demon.co.uk>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'ewah/ewah_io.c')
-rw-r--r-- | ewah/ewah_io.c | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c new file mode 100644 index 000000000..aed0da686 --- /dev/null +++ b/ewah/ewah_io.c @@ -0,0 +1,193 @@ +/** + * Copyright 2013, GitHub, Inc + * Copyright 2009-2013, Daniel Lemire, Cliff Moon, + * David McIntosh, Robert Becho, Google Inc. and Veronika Zenz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "git-compat-util.h" +#include "ewok.h" + +int ewah_serialize_native(struct ewah_bitmap *self, int fd) +{ + uint32_t write32; + size_t to_write = self->buffer_size * 8; + + /* 32 bit -- bit size for the map */ + write32 = (uint32_t)self->bit_size; + if (write(fd, &write32, 4) != 4) + return -1; + + /** 32 bit -- number of compressed 64-bit words */ + write32 = (uint32_t)self->buffer_size; + if (write(fd, &write32, 4) != 4) + return -1; + + if (write(fd, self->buffer, to_write) != to_write) + return -1; + + /** 32 bit -- position for the RLW */ + write32 = self->rlw - self->buffer; + if (write(fd, &write32, 4) != 4) + return -1; + + return (3 * 4) + to_write; +} + +int ewah_serialize_to(struct ewah_bitmap *self, + int (*write_fun)(void *, const void *, size_t), + void *data) +{ + size_t i; + eword_t dump[2048]; + const size_t words_per_dump = sizeof(dump) / sizeof(eword_t); + uint32_t bitsize, word_count, rlw_pos; + + const eword_t *buffer; + size_t words_left; + + /* 32 bit -- bit size for the map */ + bitsize = htonl((uint32_t)self->bit_size); + if (write_fun(data, &bitsize, 4) != 4) + return -1; + + /** 32 bit -- number of compressed 64-bit words */ + word_count = htonl((uint32_t)self->buffer_size); + if (write_fun(data, &word_count, 4) != 4) + return -1; + + /** 64 bit x N -- compressed words */ + buffer = self->buffer; + words_left = self->buffer_size; + + while (words_left >= words_per_dump) { + for (i = 0; i < words_per_dump; ++i, ++buffer) + dump[i] = htonll(*buffer); + + if (write_fun(data, dump, sizeof(dump)) != sizeof(dump)) + return -1; + + words_left -= words_per_dump; + } + + if (words_left) { + for (i = 0; i < words_left; ++i, ++buffer) + dump[i] = htonll(*buffer); + + if (write_fun(data, dump, words_left * 8) != words_left * 8) + return -1; + } + + /** 32 bit -- position for the RLW */ + rlw_pos = (uint8_t*)self->rlw - (uint8_t *)self->buffer; + rlw_pos = htonl(rlw_pos / sizeof(eword_t)); + + if (write_fun(data, &rlw_pos, 4) != 4) + return -1; + + return (3 * 4) + (self->buffer_size * 8); +} + +static int write_helper(void *fd, const void *buf, size_t len) +{ + return write((intptr_t)fd, buf, len); +} + +int ewah_serialize(struct ewah_bitmap *self, int fd) +{ + return ewah_serialize_to(self, write_helper, (void *)(intptr_t)fd); +} + +int ewah_read_mmap(struct ewah_bitmap *self, void *map, size_t len) +{ + uint32_t *read32 = map; + eword_t *read64; + size_t i; + + self->bit_size = ntohl(*read32++); + self->buffer_size = self->alloc_size = ntohl(*read32++); + self->buffer = ewah_realloc(self->buffer, + self->alloc_size * sizeof(eword_t)); + + if (!self->buffer) + return -1; + + for (i = 0, read64 = (void *)read32; i < self->buffer_size; ++i) + self->buffer[i] = ntohll(*read64++); + + read32 = (void *)read64; + self->rlw = self->buffer + ntohl(*read32++); + + return (3 * 4) + (self->buffer_size * 8); +} + +int ewah_deserialize(struct ewah_bitmap *self, int fd) +{ + size_t i; + eword_t dump[2048]; + const size_t words_per_dump = sizeof(dump) / sizeof(eword_t); + uint32_t bitsize, word_count, rlw_pos; + + eword_t *buffer = NULL; + size_t words_left; + + ewah_clear(self); + + /* 32 bit -- bit size for the map */ + if (read(fd, &bitsize, 4) != 4) + return -1; + + self->bit_size = (size_t)ntohl(bitsize); + + /** 32 bit -- number of compressed 64-bit words */ + if (read(fd, &word_count, 4) != 4) + return -1; + + self->buffer_size = self->alloc_size = (size_t)ntohl(word_count); + self->buffer = ewah_realloc(self->buffer, + self->alloc_size * sizeof(eword_t)); + + if (!self->buffer) + return -1; + + /** 64 bit x N -- compressed words */ + buffer = self->buffer; + words_left = self->buffer_size; + + while (words_left >= words_per_dump) { + if (read(fd, dump, sizeof(dump)) != sizeof(dump)) + return -1; + + for (i = 0; i < words_per_dump; ++i, ++buffer) + *buffer = ntohll(dump[i]); + + words_left -= words_per_dump; + } + + if (words_left) { + if (read(fd, dump, words_left * 8) != words_left * 8) + return -1; + + for (i = 0; i < words_left; ++i, ++buffer) + *buffer = ntohll(dump[i]); + } + + /** 32 bit -- position for the RLW */ + if (read(fd, &rlw_pos, 4) != 4) + return -1; + + self->rlw = self->buffer + ntohl(rlw_pos); + return 0; +} |