All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff King <peff@peff.net>
To: Derrick Stolee <stolee@gmail.com>
Cc: "SZEDER Gábor" <szeder.dev@gmail.com>,
	"Ævar Arnfjörð Bjarmason" <avarab@gmail.com>,
	"Stefan Beller" <sbeller@google.com>, git <git@vger.kernel.org>,
	"Duy Nguyen" <pclouds@gmail.com>
Subject: [PoC -- do not apply 3/3] test-tree-bitmap: replace ewah with custom rle encoding
Date: Tue, 9 Oct 2018 19:14:41 -0400	[thread overview]
Message-ID: <20181009231441.GC23730@sigill.intra.peff.net> (raw)
In-Reply-To: <20181009231250.GA19342@sigill.intra.peff.net>

The rules are basically:

 - each bitmap is a series of counts of runs of 0/1

 - each count is one of our standard varints

 - each bitmap must have at least one initial count of
   zeroes (which may itself be a zero-length count, if the
   first bit is set)

 - a zero-length count anywhere else marks the end of
   the bitmap

For a sparse bitmap, these will tend to be quite short,
because long runs are encoded as fairly small counts. The
worst case is an alternate 0/1/0/1 bitmap, where we will
spend a full byte to specify each bit (thus bloating it by a
factor of 8 over an uncompressed bitmap).

Signed-off-by: Jeff King <peff@peff.net>
---
 t/helper/test-tree-bitmap.c | 105 +++++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 14 deletions(-)

diff --git a/t/helper/test-tree-bitmap.c b/t/helper/test-tree-bitmap.c
index 6f8833344a..36f19ed464 100644
--- a/t/helper/test-tree-bitmap.c
+++ b/t/helper/test-tree-bitmap.c
@@ -3,6 +3,7 @@
 #include "diffcore.h"
 #include "argv-array.h"
 #include "ewah/ewok.h"
+#include "varint.h"
 
 /* map of pathnames to bit positions */
 struct pathmap_entry {
@@ -123,6 +124,49 @@ static void collect_paths(struct hashmap *paths)
 	free(sorted);
 }
 
+static void strbuf_add_varint(struct strbuf *out, uintmax_t val)
+{
+	size_t len;
+	strbuf_grow(out, 16); /* enough for any varint */
+	len = encode_varint(val, (unsigned char *)out->buf + out->len);
+	strbuf_setlen(out, out->len + len);
+}
+
+static void bitmap_to_rle(struct strbuf *out, struct bitmap *bitmap)
+{
+	int curval = 0; /* count zeroes, then ones, then zeroes, etc */
+	size_t run = 0;
+	size_t word;
+	size_t orig_len = out->len;
+
+	for (word = 0; word < bitmap->word_alloc; word++) {
+		int bit;
+
+		for (bit = 0; bit < BITS_IN_EWORD; bit++) {
+			int val = !!(bitmap->words[word] & (((eword_t)1) << bit));
+			if (val == curval)
+				run++;
+			else {
+				strbuf_add_varint(out, run);
+				curval = 1 - curval; /* flip 0/1 */
+				run = 1;
+			}
+		}
+	}
+
+	/*
+	 * complete the run, but do not bother with trailing zeroes, unless we
+	 * failed to write even an initial run of 0's.
+	 */
+	if (curval && run)
+		strbuf_add_varint(out, run);
+	else if (orig_len == out->len)
+		strbuf_add_varint(out, 0);
+
+	/* signal end-of-input with an empty run */
+	strbuf_add_varint(out, 0);
+}
+
 /* generate the bitmap for a single commit */
 static void generate_bitmap(struct diff_queue_struct *q,
 			    struct diff_options *opts,
@@ -130,7 +174,6 @@ static void generate_bitmap(struct diff_queue_struct *q,
 {
 	struct walk_paths_data *data = vdata;
 	struct bitmap *bitmap = bitmap_new();
-	struct ewah_bitmap *ewah;
 	struct strbuf out = STRBUF_INIT;
 	size_t i;
 
@@ -148,8 +191,7 @@ static void generate_bitmap(struct diff_queue_struct *q,
 		bitmap_set(bitmap, entry->pos);
 	}
 
-	ewah = bitmap_to_ewah(bitmap);
-	ewah_serialize_strbuf(ewah, &out);
+	bitmap_to_rle(&out, bitmap);
 
 	fwrite(data->commit->object.oid.hash, 1, GIT_SHA1_RAWSZ, stdout);
 	fwrite(out.buf, 1, out.len, stdout);
@@ -160,7 +202,6 @@ static void generate_bitmap(struct diff_queue_struct *q,
 		     (unsigned)out.len);
 
 	strbuf_release(&out);
-	ewah_free(ewah);
 	bitmap_free(bitmap);
 }
 
@@ -181,6 +222,51 @@ static void show_path(size_t pos, void *data)
 	printf("%s\n", paths[pos]);
 }
 
+static size_t rle_each_bit(const unsigned char *in, size_t len,
+			   void (*fn)(size_t, void *), void *data)
+{
+	int curval = 0; /* look for zeroes first, then ones, etc */
+	const unsigned char *cur = in;
+	const unsigned char *end = in + len;
+	size_t pos;
+
+	/* we always have a first run, even if it's 0 zeroes */
+	pos = decode_varint(&cur);
+
+	/*
+	 * ugh, varint does not seem to have a way to prevent reading past
+	 * the end of the buffer. We'll do a length check after each one,
+	 * so the worst case is bounded.
+	 */
+	if (cur > end) {
+		error("input underflow in rle");
+		return len;
+	}
+
+	while (1) {
+		size_t run = decode_varint(&cur);
+
+		if (cur > end) {
+			error("input underflow in rle");
+			return len;
+		}
+
+		if (!run)
+			break; /* empty run signals end */
+
+		curval = 1 - curval; /* flip 0/1 */
+		if (curval) {
+			/* we have a run of 1's; deliver them */
+			size_t i;
+			for (i = 0; i < run; i++)
+				fn(pos + i, data);
+		}
+		pos += run;
+	}
+
+	return cur - in;
+}
+
 static void do_dump(void)
 {
 	struct strbuf in = STRBUF_INIT;
@@ -219,7 +305,6 @@ static void do_dump(void)
 	/* read the bitmap for each commit */
 	while (remain) {
 		struct object_id oid;
-		struct ewah_bitmap *ewah;
 		ssize_t len;
 
 		if (remain < GIT_SHA1_RAWSZ) {
@@ -230,17 +315,9 @@ static void do_dump(void)
 		cur += GIT_SHA1_RAWSZ;
 		remain -= GIT_SHA1_RAWSZ;
 
-		ewah = ewah_new();
-		len = ewah_read_mmap(ewah, cur, remain);
-		if (len < 0) {
-			ewah_free(ewah);
-			goto out;
-		}
-
 		printf("%s\n", oid_to_hex(&oid));
-		ewah_each_bit(ewah, show_path, paths);
+		len = rle_each_bit((const unsigned char *)cur, remain, show_path, paths);
 
-		ewah_free(ewah);
 		cur += len;
 		remain -= len;
 	}
-- 
2.19.1.550.g7610f1eecb

  parent reply	other threads:[~2018-10-09 23:14 UTC|newest]

Thread overview: 78+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-03 13:23 We should add a "git gc --auto" after "git clone" due to commit graph Ævar Arnfjörð Bjarmason
2018-10-03 13:36 ` SZEDER Gábor
2018-10-03 13:42   ` Derrick Stolee
2018-10-03 14:18     ` Ævar Arnfjörð Bjarmason
2018-10-03 14:01   ` Ævar Arnfjörð Bjarmason
2018-10-03 14:17     ` SZEDER Gábor
2018-10-03 14:22       ` Ævar Arnfjörð Bjarmason
2018-10-03 14:53         ` SZEDER Gábor
2018-10-03 15:19           ` Ævar Arnfjörð Bjarmason
2018-10-03 16:59             ` SZEDER Gábor
2018-10-05  6:09               ` Junio C Hamano
2018-10-10 22:07                 ` SZEDER Gábor
2018-10-10 23:01                   ` Ævar Arnfjörð Bjarmason
2018-10-03 19:08           ` Stefan Beller
2018-10-03 19:21             ` Jeff King
2018-10-03 20:35               ` Ævar Arnfjörð Bjarmason
2018-10-03 17:47         ` Stefan Beller
2018-10-03 18:47           ` Ævar Arnfjörð Bjarmason
2018-10-03 18:51             ` Jeff King
2018-10-03 18:59               ` Derrick Stolee
2018-10-03 19:18                 ` Jeff King
2018-10-08 16:41                   ` SZEDER Gábor
2018-10-08 16:57                     ` Derrick Stolee
2018-10-08 18:10                       ` SZEDER Gábor
2018-10-08 18:29                         ` Derrick Stolee
2018-10-09  3:08                           ` Jeff King
2018-10-09 13:48                             ` Bloom Filters (was Re: We should add a "git gc --auto" after "git clone" due to commit graph) Derrick Stolee
2018-10-09 18:45                               ` Ævar Arnfjörð Bjarmason
2018-10-09 18:46                               ` Jeff King
2018-10-09 19:03                                 ` Derrick Stolee
2018-10-09 21:14                                   ` Jeff King
2018-10-09 23:12                                     ` Bloom Filters Jeff King
2018-10-09 23:13                                       ` [PoC -- do not apply 1/3] initial tree-bitmap proof of concept Jeff King
2018-10-09 23:14                                       ` [PoC -- do not apply 2/3] test-tree-bitmap: add "dump" mode Jeff King
2018-10-10  0:48                                         ` Junio C Hamano
2018-10-11  3:13                                           ` Jeff King
2018-10-09 23:14                                       ` Jeff King [this message]
2018-10-10  0:58                                         ` [PoC -- do not apply 3/3] test-tree-bitmap: replace ewah with custom rle encoding Junio C Hamano
2018-10-11  3:20                                           ` Jeff King
2018-10-11 12:33                                       ` Bloom Filters Derrick Stolee
2018-10-11 13:43                                         ` Jeff King
2018-10-09 21:30                             ` We should add a "git gc --auto" after "git clone" due to commit graph SZEDER Gábor
2018-10-09 19:34                       ` [PATCH 0/4] Bloom filter experiment SZEDER Gábor
2018-10-09 19:34                         ` [PATCH 1/4] Add a (very) barebones Bloom filter implementation SZEDER Gábor
2018-10-09 19:34                         ` [PATCH 2/4] commit-graph: write a Bloom filter containing changed paths for each commit SZEDER Gábor
2018-10-09 21:06                           ` Jeff King
2018-10-09 21:37                             ` SZEDER Gábor
2018-10-09 19:34                         ` [PATCH 3/4] revision.c: use the Bloom filter to speed up path-limited revision walks SZEDER Gábor
2018-10-09 19:34                         ` [PATCH 4/4] revision.c: add GIT_TRACE_BLOOM_FILTER for a bit of statistics SZEDER Gábor
2018-10-09 19:47                         ` [PATCH 0/4] Bloom filter experiment Derrick Stolee
2018-10-11  1:21                         ` [PATCH 0/2] Per-commit filter proof of concept Jonathan Tan
2018-10-11  1:21                           ` [PATCH 1/2] One filter per commit Jonathan Tan
2018-10-11 12:49                             ` Derrick Stolee
2018-10-11 19:11                               ` [PATCH] Per-commit and per-parent filters for 2 parents Jonathan Tan
2018-10-11  1:21                           ` [PATCH 2/2] Only make bloom filter for first parent Jonathan Tan
2018-10-11  7:37                           ` [PATCH 0/2] Per-commit filter proof of concept Ævar Arnfjörð Bjarmason
2018-10-15 14:39                         ` [PATCH 0/4] Bloom filter experiment Derrick Stolee
2018-10-16  4:45                           ` Junio C Hamano
2018-10-16 11:13                             ` Derrick Stolee
2018-10-16 12:57                               ` Ævar Arnfjörð Bjarmason
2018-10-16 13:03                                 ` Derrick Stolee
2018-10-18  2:00                                 ` Junio C Hamano
2018-10-16 23:41                           ` Jonathan Tan
2018-10-08 23:02                     ` We should add a "git gc --auto" after "git clone" due to commit graph Junio C Hamano
2018-10-03 14:32     ` Duy Nguyen
2018-10-03 16:45 ` Duy Nguyen
2018-10-04 21:42 ` [RFC PATCH] " Ævar Arnfjörð Bjarmason
2018-10-05 12:05   ` Derrick Stolee
2018-10-05 13:05     ` Ævar Arnfjörð Bjarmason
2018-10-05 13:45       ` Derrick Stolee
2018-10-05 14:04         ` Ævar Arnfjörð Bjarmason
2018-10-05 19:21         ` Jeff King
2018-10-05 19:41           ` Derrick Stolee
2018-10-05 19:47             ` Jeff King
2018-10-05 20:00               ` Derrick Stolee
2018-10-05 20:02                 ` Jeff King
2018-10-05 20:01               ` Ævar Arnfjörð Bjarmason
2018-10-05 20:09                 ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181009231441.GC23730@sigill.intra.peff.net \
    --to=peff@peff.net \
    --cc=avarab@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=pclouds@gmail.com \
    --cc=sbeller@google.com \
    --cc=stolee@gmail.com \
    --cc=szeder.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.