All of lore.kernel.org
 help / color / mirror / Atom feed
From: "SZEDER Gábor" <szeder.dev@gmail.com>
To: git@vger.kernel.org
Cc: "Jeff King" <peff@peff.net>, "Junio C Hamano" <gitster@pobox.com>,
	"Derrick Stolee" <stolee@gmail.com>,
	"Ævar Arnfjörð Bjarmason" <avarab@gmail.com>,
	"Stefan Beller" <sbeller@google.com>,
	"Duy Nguyen" <pclouds@gmail.com>,
	"SZEDER Gábor" <szeder.dev@gmail.com>
Subject: [PATCH 1/4] Add a (very) barebones Bloom filter implementation
Date: Tue,  9 Oct 2018 21:34:42 +0200	[thread overview]
Message-ID: <20181009193445.21908-2-szeder.dev@gmail.com> (raw)
In-Reply-To: <20181009193445.21908-1-szeder.dev@gmail.com>

---
 Makefile       |   1 +
 bloom-filter.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 bloom-filter.h |  39 +++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 bloom-filter.c
 create mode 100644 bloom-filter.h

diff --git a/Makefile b/Makefile
index 13e1c52478..850eafb3ee 100644
--- a/Makefile
+++ b/Makefile
@@ -827,6 +827,7 @@ LIB_OBJS += base85.o
 LIB_OBJS += bisect.o
 LIB_OBJS += blame.o
 LIB_OBJS += blob.o
+LIB_OBJS += bloom-filter.o
 LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
 LIB_OBJS += bundle.o
diff --git a/bloom-filter.c b/bloom-filter.c
new file mode 100644
index 0000000000..7dce0e35fa
--- /dev/null
+++ b/bloom-filter.c
@@ -0,0 +1,103 @@
+#include "cache.h"
+#include "bloom-filter.h"
+
+/* Make 'bf' an empty filter of 'bit_size' bits (a multiple of CHAR_BIT). */
+void bloom_filter_init(struct bloom_filter *bf, uint32_t bit_size)
+{
+	if (bit_size % CHAR_BIT)
+		BUG("invalid size for bloom filter");
+	bf->nr_entries = 0;
+	bf->bit_size = bit_size;
+	bf->bits = xcalloc(1, bit_size / CHAR_BIT); /* no bit may start set */
+}
+
+/* Drop the bit array of 'bf' and zero its size and entry count. */
+void bloom_filter_free(struct bloom_filter *bf)
+{
+	FREE_AND_NULL(bf->bits);
+	bf->bit_size = 0;
+	bf->nr_entries = 0;
+}
+
+/* Set the bit of each offset (mod bit_size); add nr_entries to the count. */
+void bloom_filter_set_bits(struct bloom_filter *bf, const uint32_t *offsets,
+			   int nr_offsets, int nr_entries)
+{
+	int n;
+	for (n = 0; n < nr_offsets; n++) {
+		uint32_t pos = offsets[n] % bf->bit_size;
+		bf->bits[pos / CHAR_BIT] |= 1 << offsets[n] % CHAR_BIT;
+	}
+	bf->nr_entries += nr_entries;
+}
+
+/* Return 1 if the bit of every given offset (mod bit_size) is set, else 0. */
+int bloom_filter_check_bits(struct bloom_filter *bf, const uint32_t *offsets,
+			    int nr)
+{
+	int n;
+	for (n = 0; n < nr; n++) {
+		uint32_t pos = offsets[n] % bf->bit_size;
+		if (!(bf->bits[pos / CHAR_BIT] & (1 << offsets[n] % CHAR_BIT)))
+			return 0;
+	}
+	return 1;
+}
+
+
+void bloom_filter_add_hash(struct bloom_filter *bf, const unsigned char *hash)
+{
+	uint32_t words[GIT_MAX_RAWSZ / sizeof(uint32_t)];
+	int nr_words = the_hash_algo->rawsz / sizeof(*words); /* words in use */
+	hashcpy((unsigned char *)words, hash); /* hash bytes become offsets */
+	bloom_filter_set_bits(bf, words, nr_words, 1);
+}
+
+int bloom_filter_check_hash(struct bloom_filter *bf, const unsigned char *hash)
+{
+	uint32_t words[GIT_MAX_RAWSZ / sizeof(uint32_t)];
+	int nr_words = the_hash_algo->rawsz / sizeof(*words); /* words in use */
+	hashcpy((unsigned char *)words, hash); /* hash bytes become offsets */
+	return bloom_filter_check_bits(bf, words, nr_words);
+}
+
+void hashxor(const unsigned char *hash1, const unsigned char *hash2,
+	     unsigned char *out)
+{
+	size_t pos, len = the_hash_algo->rawsz; /* out[] gets 'len' XORed bytes */
+	for (pos = 0; pos < len; pos++)
+		out[pos] = hash1[pos] ^ hash2[pos];
+}
+
+/* Path of the filter file used by load/write below; hardcoded for now... */
+static GIT_PATH_FUNC(git_path_bloom, "objects/info/bloom")
+
+/* Read the filter file into 'bf'; return 0 on success, -1 on failure. */
+int bloom_filter_load(struct bloom_filter *bf)
+{
+	uint32_t hdr[2], len; /* hdr: nr_entries, bit_size; len: bytes of bits */
+	int fd = open(git_path_bloom(), O_RDONLY);
+	if (fd < 0)
+		return -1;
+	if (read_in_full(fd, hdr, sizeof(hdr)) != sizeof(hdr))
+		hdr[1] = 0; /* truncated header: treat like an unusable size */
+	len = hdr[1] % CHAR_BIT ? 0 : hdr[1] / CHAR_BIT;
+	bf->bits = len ? xmalloc(len) : NULL;
+	if (len && read_in_full(fd, bf->bits, len) != len)
+		FREE_AND_NULL(bf->bits);
+	bf->nr_entries = bf->bits ? hdr[0] : 0;
+	bf->bit_size = bf->bits ? hdr[1] : 0;
+	close(fd);
+	return bf->bits ? 0 : -1;
+}
+
+/* Write 'bf' to the filter file: nr_entries, bit_size, then the bit array. */
+void bloom_filter_write(struct bloom_filter *bf)
+{
+	int fd = xopen(git_path_bloom(), O_WRONLY | O_CREAT | O_TRUNC, 0666);
+	if (write_in_full(fd, &bf->nr_entries, sizeof(bf->nr_entries)) < 0 ||
+	    write_in_full(fd, &bf->bit_size, sizeof(bf->bit_size)) < 0 ||
+	    write_in_full(fd, bf->bits, bf->bit_size / CHAR_BIT) < 0)
+		die_errno("unable to write bloom filter");
+	close(fd);
+}
diff --git a/bloom-filter.h b/bloom-filter.h
new file mode 100644
index 0000000000..94d0af1708
--- /dev/null
+++ b/bloom-filter.h
@@ -0,0 +1,39 @@
+#ifndef BLOOM_FILTER_H
+#define BLOOM_FILTER_H
+
+#include "git-compat-util.h"
+
+struct bloom_filter {
+	uint32_t nr_entries;	/* sum of the nr_entries passed to set_bits() */
+	uint32_t bit_size;	/* number of bits in 'bits'; multiple of CHAR_BIT */
+	unsigned char *bits;	/* bit array of bit_size / CHAR_BIT bytes */
+};
+
+
+void bloom_filter_init(struct bloom_filter *bf, uint32_t bit_size);
+void bloom_filter_free(struct bloom_filter *bf);
+
+void bloom_filter_set_bits(struct bloom_filter *bf, const uint32_t *offsets,
+			   int nr_offsets, int nr_entries);
+int bloom_filter_check_bits(struct bloom_filter *bf, const uint32_t *offsets,
+			    int nr);
+
+/*
+ * Splits the given hash into 32-bit unsigned ints (5 of them for SHA-1), and
+ * sets the bits at those positions (modulo the bitmap's size) in the filter.
+ */
+void bloom_filter_add_hash(struct bloom_filter *bf, const unsigned char *hash);
+/*
+ * Splits the given hash into 32-bit unsigned ints (5 of them for SHA-1), and
+ * checks the bits at those positions (modulo the bitmap's size) in the filter.
+ * Returns 1 if all those bits are set, 0 otherwise.
+ */
+int bloom_filter_check_hash(struct bloom_filter *bf, const unsigned char *hash);
+
+void hashxor(const unsigned char *hash1, const unsigned char *hash2,
+	     unsigned char *out);
+
+int bloom_filter_load(struct bloom_filter *bf);
+void bloom_filter_write(struct bloom_filter *bf);
+
+#endif
-- 
2.19.1.409.g0a0ee5eb6b


  reply	other threads:[~2018-10-09 19:35 UTC|newest]

Thread overview: 78+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-03 13:23 We should add a "git gc --auto" after "git clone" due to commit graph Ævar Arnfjörð Bjarmason
2018-10-03 13:36 ` SZEDER Gábor
2018-10-03 13:42   ` Derrick Stolee
2018-10-03 14:18     ` Ævar Arnfjörð Bjarmason
2018-10-03 14:01   ` Ævar Arnfjörð Bjarmason
2018-10-03 14:17     ` SZEDER Gábor
2018-10-03 14:22       ` Ævar Arnfjörð Bjarmason
2018-10-03 14:53         ` SZEDER Gábor
2018-10-03 15:19           ` Ævar Arnfjörð Bjarmason
2018-10-03 16:59             ` SZEDER Gábor
2018-10-05  6:09               ` Junio C Hamano
2018-10-10 22:07                 ` SZEDER Gábor
2018-10-10 23:01                   ` Ævar Arnfjörð Bjarmason
2018-10-03 19:08           ` Stefan Beller
2018-10-03 19:21             ` Jeff King
2018-10-03 20:35               ` Ævar Arnfjörð Bjarmason
2018-10-03 17:47         ` Stefan Beller
2018-10-03 18:47           ` Ævar Arnfjörð Bjarmason
2018-10-03 18:51             ` Jeff King
2018-10-03 18:59               ` Derrick Stolee
2018-10-03 19:18                 ` Jeff King
2018-10-08 16:41                   ` SZEDER Gábor
2018-10-08 16:57                     ` Derrick Stolee
2018-10-08 18:10                       ` SZEDER Gábor
2018-10-08 18:29                         ` Derrick Stolee
2018-10-09  3:08                           ` Jeff King
2018-10-09 13:48                             ` Bloom Filters (was Re: We should add a "git gc --auto" after "git clone" due to commit graph) Derrick Stolee
2018-10-09 18:45                               ` Ævar Arnfjörð Bjarmason
2018-10-09 18:46                               ` Jeff King
2018-10-09 19:03                                 ` Derrick Stolee
2018-10-09 21:14                                   ` Jeff King
2018-10-09 23:12                                     ` Bloom Filters Jeff King
2018-10-09 23:13                                       ` [PoC -- do not apply 1/3] initial tree-bitmap proof of concept Jeff King
2018-10-09 23:14                                       ` [PoC -- do not apply 2/3] test-tree-bitmap: add "dump" mode Jeff King
2018-10-10  0:48                                         ` Junio C Hamano
2018-10-11  3:13                                           ` Jeff King
2018-10-09 23:14                                       ` [PoC -- do not apply 3/3] test-tree-bitmap: replace ewah with custom rle encoding Jeff King
2018-10-10  0:58                                         ` Junio C Hamano
2018-10-11  3:20                                           ` Jeff King
2018-10-11 12:33                                       ` Bloom Filters Derrick Stolee
2018-10-11 13:43                                         ` Jeff King
2018-10-09 21:30                             ` We should add a "git gc --auto" after "git clone" due to commit graph SZEDER Gábor
2018-10-09 19:34                       ` [PATCH 0/4] Bloom filter experiment SZEDER Gábor
2018-10-09 19:34                         ` SZEDER Gábor [this message]
2018-10-09 19:34                         ` [PATCH 2/4] commit-graph: write a Bloom filter containing changed paths for each commit SZEDER Gábor
2018-10-09 21:06                           ` Jeff King
2018-10-09 21:37                             ` SZEDER Gábor
2018-10-09 19:34                         ` [PATCH 3/4] revision.c: use the Bloom filter to speed up path-limited revision walks SZEDER Gábor
2018-10-09 19:34                         ` [PATCH 4/4] revision.c: add GIT_TRACE_BLOOM_FILTER for a bit of statistics SZEDER Gábor
2018-10-09 19:47                         ` [PATCH 0/4] Bloom filter experiment Derrick Stolee
2018-10-11  1:21                         ` [PATCH 0/2] Per-commit filter proof of concept Jonathan Tan
2018-10-11  1:21                           ` [PATCH 1/2] One filter per commit Jonathan Tan
2018-10-11 12:49                             ` Derrick Stolee
2018-10-11 19:11                               ` [PATCH] Per-commit and per-parent filters for 2 parents Jonathan Tan
2018-10-11  1:21                           ` [PATCH 2/2] Only make bloom filter for first parent Jonathan Tan
2018-10-11  7:37                           ` [PATCH 0/2] Per-commit filter proof of concept Ævar Arnfjörð Bjarmason
2018-10-15 14:39                         ` [PATCH 0/4] Bloom filter experiment Derrick Stolee
2018-10-16  4:45                           ` Junio C Hamano
2018-10-16 11:13                             ` Derrick Stolee
2018-10-16 12:57                               ` Ævar Arnfjörð Bjarmason
2018-10-16 13:03                                 ` Derrick Stolee
2018-10-18  2:00                                 ` Junio C Hamano
2018-10-16 23:41                           ` Jonathan Tan
2018-10-08 23:02                     ` We should add a "git gc --auto" after "git clone" due to commit graph Junio C Hamano
2018-10-03 14:32     ` Duy Nguyen
2018-10-03 16:45 ` Duy Nguyen
2018-10-04 21:42 ` [RFC PATCH] " Ævar Arnfjörð Bjarmason
2018-10-05 12:05   ` Derrick Stolee
2018-10-05 13:05     ` Ævar Arnfjörð Bjarmason
2018-10-05 13:45       ` Derrick Stolee
2018-10-05 14:04         ` Ævar Arnfjörð Bjarmason
2018-10-05 19:21         ` Jeff King
2018-10-05 19:41           ` Derrick Stolee
2018-10-05 19:47             ` Jeff King
2018-10-05 20:00               ` Derrick Stolee
2018-10-05 20:02                 ` Jeff King
2018-10-05 20:01               ` Ævar Arnfjörð Bjarmason
2018-10-05 20:09                 ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181009193445.21908-2-szeder.dev@gmail.com \
    --to=szeder.dev@gmail.com \
    --cc=avarab@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=pclouds@gmail.com \
    --cc=peff@peff.net \
    --cc=sbeller@google.com \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.