git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Garima Singh via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: stolee@gmail.com, szeder.dev@gmail.com, jonathantanmy@google.com,
	Garima Singh <garima.singh@microsoft.com>,
	Garima Singh <garima.singh@microsoft.com>
Subject: [PATCH v3 02/16] bloom.c: add the murmur3 hash implementation
Date: Mon, 30 Mar 2020 00:31:24 +0000	[thread overview]
Message-ID: <a5aa3415c05ee9bc67a9471445a20c71a9834673.1585528298.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.497.v3.git.1585528298.gitgitgadget@gmail.com>

From: Garima Singh <garima.singh@microsoft.com>

In preparation for computing changed paths Bloom filters,
implement the Murmur3 hash algorithm as described in [1].
It hashes the given data using the given seed and produces
a uniformly distributed hash value.

[1] https://en.wikipedia.org/wiki/MurmurHash#Algorithm

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Helped-by: Szeder Gábor <szeder.dev@gmail.com>
Reviewed-by: Jakub Narębski <jnareb@gmail.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 Makefile              |  2 ++
 bloom.c               | 73 +++++++++++++++++++++++++++++++++++++++++++
 bloom.h               | 13 ++++++++
 t/helper/test-bloom.c | 13 ++++++++
 t/helper/test-tool.c  |  1 +
 t/helper/test-tool.h  |  1 +
 t/t0095-bloom.sh      | 30 ++++++++++++++++++
 7 files changed, 133 insertions(+)
 create mode 100644 bloom.c
 create mode 100644 bloom.h
 create mode 100644 t/helper/test-bloom.c
 create mode 100755 t/t0095-bloom.sh

diff --git a/Makefile b/Makefile
index ef1ff2228f0..491f75e68c5 100644
--- a/Makefile
+++ b/Makefile
@@ -695,6 +695,7 @@ X =
 PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
 
 TEST_BUILTINS_OBJS += test-advise.o
+TEST_BUILTINS_OBJS += test-bloom.o
 TEST_BUILTINS_OBJS += test-chmtime.o
 TEST_BUILTINS_OBJS += test-config.o
 TEST_BUILTINS_OBJS += test-ctype.o
@@ -840,6 +841,7 @@ LIB_OBJS += base85.o
 LIB_OBJS += bisect.o
 LIB_OBJS += blame.o
 LIB_OBJS += blob.o
+LIB_OBJS += bloom.o
 LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
 LIB_OBJS += bundle.o
diff --git a/bloom.c b/bloom.c
new file mode 100644
index 00000000000..40e87632aeb
--- /dev/null
+++ b/bloom.c
@@ -0,0 +1,73 @@
+#include "git-compat-util.h"
+#include "bloom.h"
+
+static uint32_t rotate_left(uint32_t value, int32_t count)
+{
+	uint32_t mask = 8 * sizeof(uint32_t) - 1;
+	count &= mask;
+	return ((value << count) | (value >> ((-count) & mask)));
+}
+
+/*
+ * Calculate the murmur3 32-bit hash value for the given data
+ * using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ */
+uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
+{
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t r1 = 15;
+	const uint32_t r2 = 13;
+	const uint32_t m = 5;
+	const uint32_t n = 0xe6546b64;
+	int i;
+	uint32_t k1 = 0;
+	const char *tail;
+
+	int len4 = len / sizeof(uint32_t);
+
+	uint32_t k;
+	for (i = 0; i < len4; i++) {
+		uint32_t byte1 = (uint32_t)data[4*i];
+		uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
+		uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
+		uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
+		k = byte1 | byte2 | byte3 | byte4;
+		k *= c1;
+		k = rotate_left(k, r1);
+		k *= c2;
+
+		seed ^= k;
+		seed = rotate_left(seed, r2) * m + n;
+	}
+
+	tail = (data + len4 * sizeof(uint32_t));
+
+	switch (len & (sizeof(uint32_t) - 1)) {
+	case 3:
+		k1 ^= ((uint32_t)tail[2]) << 16;
+		/*-fallthrough*/
+	case 2:
+		k1 ^= ((uint32_t)tail[1]) << 8;
+		/*-fallthrough*/
+	case 1:
+		k1 ^= ((uint32_t)tail[0]) << 0;
+		k1 *= c1;
+		k1 = rotate_left(k1, r1);
+		k1 *= c2;
+		seed ^= k1;
+		break;
+	}
+
+	seed ^= (uint32_t)len;
+	seed ^= (seed >> 16);
+	seed *= 0x85ebca6b;
+	seed ^= (seed >> 13);
+	seed *= 0xc2b2ae35;
+	seed ^= (seed >> 16);
+
+	return seed;
+}
\ No newline at end of file
diff --git a/bloom.h b/bloom.h
new file mode 100644
index 00000000000..d0fcc5f0aa6
--- /dev/null
+++ b/bloom.h
@@ -0,0 +1,13 @@
+#ifndef BLOOM_H
+#define BLOOM_H
+
+/*
+ * Calculate the murmur3 32-bit hash value for the given data
+ * using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ */
+uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
+
+#endif
\ No newline at end of file
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
new file mode 100644
index 00000000000..60ee2043689
--- /dev/null
+++ b/t/helper/test-bloom.c
@@ -0,0 +1,13 @@
+#include "git-compat-util.h"
+#include "bloom.h"
+#include "test-tool.h"
+
+int cmd__bloom(int argc, const char **argv)
+{
+	if (!strcmp(argv[1], "get_murmur3")) {
+		uint32_t hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
+		printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
+	}
+
+	return 0;
+}
\ No newline at end of file
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 31eedcd241f..6e26bd65c97 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -15,6 +15,7 @@ struct test_cmd {
 
 static struct test_cmd cmds[] = {
 	{ "advise", cmd__advise_if_enabled },
+	{ "bloom", cmd__bloom },
 	{ "chmtime", cmd__chmtime },
 	{ "config", cmd__config },
 	{ "ctype", cmd__ctype },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 4eb5e6609e1..dceeef1d5c2 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -5,6 +5,7 @@
 #include "git-compat-util.h"
 
 int cmd__advise_if_enabled(int argc, const char **argv);
+int cmd__bloom(int argc, const char **argv);
 int cmd__chmtime(int argc, const char **argv);
 int cmd__config(int argc, const char **argv);
 int cmd__ctype(int argc, const char **argv);
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
new file mode 100755
index 00000000000..2dad8c4a94e
--- /dev/null
+++ b/t/t0095-bloom.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+test_description='Testing the various Bloom filter computations in bloom.c'
+. ./test-lib.sh
+
+test_expect_success 'compute unseeded murmur3 hash for empty string' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x00000000
+	EOF
+	test-tool bloom get_murmur3 "" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute unseeded murmur3 hash for test string 1' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x627b0c2c
+	EOF
+	test-tool bloom get_murmur3 "Hello world!" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute unseeded murmur3 hash for test string 2' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x2e4ff723
+	EOF
+	test-tool bloom get_murmur3 "The quick brown fox jumps over the lazy dog" >actual &&
+	test_cmp expect actual
+'
+
+test_done
\ No newline at end of file
-- 
gitgitgadget


  parent reply	other threads:[~2020-03-30  0:31 UTC|newest]

Thread overview: 159+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-12-20 22:05 [PATCH 0/9] [RFC] Changed Paths Bloom Filters Garima Singh via GitGitGadget
2019-12-20 22:05 ` [PATCH 1/9] commit-graph: add --changed-paths option to write Garima Singh via GitGitGadget
2020-01-01 20:20   ` Jakub Narebski
2019-12-20 22:05 ` [PATCH 2/9] commit-graph: write changed paths bloom filters Garima Singh via GitGitGadget
2019-12-21 16:48   ` Philip Oakley
2020-01-06 18:44   ` Jakub Narebski
2020-01-13 19:48     ` Garima Singh
2019-12-20 22:05 ` [PATCH 3/9] commit-graph: use MAX_NUM_CHUNKS Garima Singh via GitGitGadget
2020-01-07 12:19   ` Jakub Narebski
2019-12-20 22:05 ` [PATCH 4/9] commit-graph: document bloom filter format Garima Singh via GitGitGadget
2020-01-07 14:46   ` Jakub Narebski
2019-12-20 22:05 ` [PATCH 5/9] commit-graph: write changed path bloom filters to commit-graph file Garima Singh via GitGitGadget
2020-01-07 16:01   ` Jakub Narebski
2020-01-14 15:14     ` Garima Singh
2019-12-20 22:05 ` [PATCH 6/9] commit-graph: test commit-graph write --changed-paths Garima Singh via GitGitGadget
2020-01-08  0:32   ` Jakub Narebski
2019-12-20 22:05 ` [PATCH 7/9] commit-graph: reuse existing bloom filters during write Garima Singh via GitGitGadget
2020-01-09 19:12   ` Jakub Narebski
2019-12-20 22:05 ` [PATCH 8/9] revision.c: use bloom filters to speed up path based revision walks Garima Singh via GitGitGadget
2020-01-11  0:27   ` Jakub Narebski
2020-01-15  0:08     ` Garima Singh
2019-12-20 22:05 ` [PATCH 9/9] commit-graph: add GIT_TEST_COMMIT_GRAPH_BLOOM_FILTERS test flag Garima Singh via GitGitGadget
2020-01-11 19:56   ` Jakub Narebski
2020-01-15  0:55     ` Garima Singh
2019-12-20 22:14 ` [PATCH 0/9] [RFC] Changed Paths Bloom Filters Junio C Hamano
2019-12-22  9:26 ` Christian Couder
2019-12-22  9:38   ` Jeff King
2020-01-01 12:04     ` Jakub Narebski
2019-12-22  9:30 ` Jeff King
2019-12-22  9:32   ` [PATCH 1/3] commit-graph: examine changed-path objects in pack order Jeff King
2019-12-27 14:51     ` Derrick Stolee
2019-12-29  6:12       ` Jeff King
2019-12-29  6:28         ` Jeff King
2019-12-30 14:37         ` Derrick Stolee
2019-12-30 14:51           ` Derrick Stolee
2019-12-22  9:32   ` [PATCH 2/3] commit-graph: free large diffs, too Jeff King
2019-12-27 14:52     ` Derrick Stolee
2019-12-22  9:32   ` [PATCH 3/3] commit-graph: stop using full rev_info for diffs Jeff King
2019-12-27 14:53     ` Derrick Stolee
2019-12-26 14:21   ` [PATCH 0/9] [RFC] Changed Paths Bloom Filters Derrick Stolee
2019-12-29  6:03     ` Jeff King
2019-12-27 16:11   ` Derrick Stolee
2019-12-29  6:24     ` Jeff King
2019-12-30 16:04       ` Derrick Stolee
2019-12-30 17:02       ` Junio C Hamano
2019-12-31 16:45 ` Jakub Narebski
2020-01-13 16:54   ` Garima Singh
2020-01-20 13:48     ` Jakub Narebski
2020-01-21 16:14       ` Garima Singh
2020-02-02 18:43         ` Jakub Narebski
2020-01-21 23:40 ` Emily Shaffer
2020-01-27 18:24   ` Garima Singh
2020-02-01 23:32   ` Jakub Narebski
2020-02-05 22:56 ` [PATCH v2 00/11] " Garima Singh via GitGitGadget
2020-02-05 22:56   ` [PATCH v2 01/11] commit-graph: use MAX_NUM_CHUNKS Garima Singh via GitGitGadget
2020-02-09 12:39     ` Jakub Narebski
2020-02-05 22:56   ` [PATCH v2 02/11] bloom: core Bloom filter implementation for changed paths Garima Singh via GitGitGadget
2020-02-15 17:17     ` Jakub Narebski
2020-02-16 16:49     ` Jakub Narebski
2020-02-22  0:32       ` Garima Singh
2020-02-23 13:38         ` Jakub Narebski
2020-02-24 17:34           ` Garima Singh
2020-02-24 18:20             ` Jakub Narebski
2020-02-05 22:56   ` [PATCH v2 03/11] diff: halt tree-diff early after max_changes Derrick Stolee via GitGitGadget
2020-02-17  0:00     ` Jakub Narebski
2020-02-22  0:37       ` Garima Singh
2020-02-05 22:56   ` [PATCH v2 04/11] commit-graph: compute Bloom filters for changed paths Garima Singh via GitGitGadget
2020-02-17 21:56     ` Jakub Narebski
2020-02-22  0:55       ` Garima Singh
2020-02-23 17:34         ` Jakub Narebski
2020-02-05 22:56   ` [PATCH v2 05/11] commit-graph: examine changed-path objects in pack order Jeff King via GitGitGadget
2020-02-18 17:59     ` Jakub Narebski
2020-02-24 18:29       ` Garima Singh
2020-02-05 22:56   ` [PATCH v2 06/11] commit-graph: examine commits by generation number Derrick Stolee via GitGitGadget
2020-02-19  0:32     ` Jakub Narebski
2020-02-24 20:45       ` Garima Singh
2020-02-05 22:56   ` [PATCH v2 07/11] commit-graph: write Bloom filters to commit graph file Garima Singh via GitGitGadget
2020-02-19 15:13     ` Jakub Narebski
2020-02-24 21:14       ` Garima Singh
2020-02-25 11:40         ` Jakub Narebski
2020-02-25 15:58           ` Garima Singh
2020-02-05 22:56   ` [PATCH v2 08/11] commit-graph: reuse existing Bloom filters during write Garima Singh via GitGitGadget
2020-02-20 18:48     ` Jakub Narebski
2020-02-24 21:45       ` Garima Singh
2020-02-05 22:56   ` [PATCH v2 09/11] commit-graph: add --changed-paths option to write subcommand Garima Singh via GitGitGadget
2020-02-20 20:28     ` Jakub Narebski
2020-02-24 21:51       ` Garima Singh
2020-02-25 12:10         ` Jakub Narebski
2020-02-20 22:10     ` Bryan Turner
2020-02-22  1:44       ` Garima Singh
2020-02-05 22:56   ` [PATCH v2 10/11] revision.c: use Bloom filters to speed up path based revision walks Garima Singh via GitGitGadget
2020-02-21 17:31     ` Jakub Narebski
2020-02-21 22:45     ` Jakub Narebski
2020-02-05 22:56   ` [PATCH v2 11/11] commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag Garima Singh via GitGitGadget
2020-02-22  0:11     ` Jakub Narebski
2020-02-07 13:52   ` [PATCH v2 00/11] Changed Paths Bloom Filters SZEDER Gábor
2020-02-07 15:09     ` Garima Singh
2020-02-07 15:36       ` Derrick Stolee
2020-02-07 16:15         ` SZEDER Gábor
2020-02-07 16:33           ` Derrick Stolee
2020-02-11 19:08       ` Garima Singh
2020-02-08 23:04   ` Jakub Narebski
2020-02-21 17:41     ` Garima Singh
2020-03-29 18:36       ` Junio C Hamano
2020-03-30  0:31   ` [PATCH v3 00/16] " Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 01/16] commit-graph: define and use MAX_NUM_CHUNKS Garima Singh via GitGitGadget
2020-03-30  0:31     ` Garima Singh via GitGitGadget [this message]
2020-03-30  0:31     ` [PATCH v3 03/16] bloom.c: introduce core Bloom filter constructs Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 04/16] bloom.c: core Bloom filter implementation for changed paths Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 05/16] diff: halt tree-diff early after max_changes Derrick Stolee via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 06/16] commit-graph: compute Bloom filters for changed paths Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 07/16] commit-graph: examine changed-path objects in pack order Jeff King via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 08/16] commit-graph: examine commits by generation number Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 09/16] diff: skip batch object download when possible Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 10/16] commit-graph: write Bloom filters to commit graph file Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 11/16] commit-graph: reuse existing Bloom filters during write Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 12/16] commit-graph: add --changed-paths option to write subcommand Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 13/16] revision.c: use Bloom filters to speed up path based revision walks Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 14/16] revision.c: add trace2 stats around Bloom filter usage Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 15/16] t4216: add end to end tests for git log with Bloom filters Garima Singh via GitGitGadget
2020-03-30  0:31     ` [PATCH v3 16/16] commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag Garima Singh via GitGitGadget
2020-04-06 16:59     ` [PATCH v4 00/15] Changed Paths Bloom Filters Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 01/15] commit-graph: define and use MAX_NUM_CHUNKS Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 02/15] bloom.c: add the murmur3 hash implementation Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 03/15] bloom.c: introduce core Bloom filter constructs Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 04/15] bloom.c: core Bloom filter implementation for changed paths Garima Singh via GitGitGadget
2020-06-27 15:53         ` SZEDER Gábor
2020-04-06 16:59       ` [PATCH v4 05/15] diff: halt tree-diff early after max_changes Derrick Stolee via GitGitGadget
2020-08-04 14:47         ` SZEDER Gábor
2020-08-04 16:25           ` Derrick Stolee
2020-08-04 17:00             ` SZEDER Gábor
2020-08-04 17:31               ` Derrick Stolee
2020-08-05 17:08                 ` Derrick Stolee
2020-04-06 16:59       ` [PATCH v4 06/15] commit-graph: compute Bloom filters for changed paths Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 07/15] commit-graph: examine changed-path objects in pack order Jeff King via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 08/15] commit-graph: examine commits by generation number Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 09/15] commit-graph: write Bloom filters to commit graph file Garima Singh via GitGitGadget
2020-05-29  8:57         ` SZEDER Gábor
2020-05-29 13:35           ` Derrick Stolee
2020-05-31 17:23             ` SZEDER Gábor
2020-07-09 17:00         ` [PATCH] commit-graph: fix "Writing out commit graph" progress counter SZEDER Gábor
2020-07-09 18:01           ` Derrick Stolee
2020-07-09 18:20             ` Derrick Stolee
2020-04-06 16:59       ` [PATCH v4 10/15] commit-graph: reuse existing Bloom filters during write Garima Singh via GitGitGadget
2020-06-19 14:02         ` SZEDER Gábor
2020-06-19 19:28           ` Junio C Hamano
2020-07-27 21:33         ` SZEDER Gábor
2020-04-06 16:59       ` [PATCH v4 11/15] commit-graph: add --changed-paths option to write subcommand Garima Singh via GitGitGadget
2020-06-07 22:21         ` SZEDER Gábor
2020-04-06 16:59       ` [PATCH v4 12/15] revision.c: use Bloom filters to speed up path based revision walks Garima Singh via GitGitGadget
2020-06-26  6:34         ` SZEDER Gábor
2020-04-06 16:59       ` [PATCH v4 13/15] revision.c: add trace2 stats around Bloom filter usage Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 14/15] t4216: add end to end tests for git log with Bloom filters Garima Singh via GitGitGadget
2020-04-06 16:59       ` [PATCH v4 15/15] commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag Garima Singh via GitGitGadget
2020-04-08 15:51       ` [PATCH v4 00/15] Changed Paths Bloom Filters Derrick Stolee
2020-04-08 19:21         ` Junio C Hamano
2020-04-08 20:05         ` Jakub Narębski
2020-04-12 20:34         ` Taylor Blau
2020-03-05 19:49 ` [PATCH 0/9] [RFC] " Garima Singh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a5aa3415c05ee9bc67a9471445a20c71a9834673.1585528298.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=garima.singh@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=jonathantanmy@google.com \
    --cc=stolee@gmail.com \
    --cc=szeder.dev@gmail.com \
    --subject='Re: [PATCH v3 02/16] bloom.c: add the murmur3 hash implementation' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
on how to clone and mirror all data and code used for this inbox