From: "SZEDER Gábor" <szeder.dev@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
"Garima Singh" <garima.singh@microsoft.com>,
"Derrick Stolee" <stolee@gmail.com>,
"Jakub Narebski" <jnareb@gmail.com>, "Jeff King" <peff@peff.net>,
"Taylor Blau" <me@ttaylorr.com>,
"SZEDER Gábor" <szeder.dev@gmail.com>
Subject: [PATCH 26/34] commit-graph: deduplicate modified path Bloom filters
Date: Fri, 29 May 2020 10:50:30 +0200 [thread overview]
Message-ID: <20200529085038.26008-27-szeder.dev@gmail.com> (raw)
In-Reply-To: <20200529085038.26008-1-szeder.dev@gmail.com>
Some commits modify the same set of paths, and, consequently, have
identical modified path Bloom filters.
Use a hashmap to find these identical Bloom filters, and omit all
duplicates from the Modified Path Bloom Filters chunk, reducing its
size.
MPBF chunk size
without with Time spent
deduplication deduping
--------------------------------------------------------
android-base 8620198 2618764 -69.6% 0.089s
cmssw 10347018 9094468 -12.1% 0.056s
cpython 526381 371629 -29.4% 0.013s
elasticsearch 4733274 3607106 -23.8% 0.029s
gcc 3724544 3394521 -8.9% 0.072s
gecko-dev 20591010 17042732 -17.2% 0.235s
git 160718 139546 -13.2% 0.004s
glibc 759132 699623 -7.8% 0.009s
go 458375 421394 -8.1% 0.008s
jdk 13213208 12158533 -8.0% 0.034s
linux 26575278 25029700 -5.8% 0.169s
llvm-project 3988133 3269438 -18.0% 0.085s
rails 958691 614528 -35.9% 0.015s
rust 1985782 1682919 -15.3% 0.023s
tensorflow 4173570 3789768 -9.2% 0.022s
webkit 5891751 5394507 -8.4% 0.096s
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
---
commit-graph.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 71 insertions(+), 5 deletions(-)
diff --git a/commit-graph.c b/commit-graph.c
index 56b8f33d10..178bbf0113 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -57,6 +57,7 @@
struct modified_path_bloom_filter_info {
struct bloom_filter filter;
+ struct modified_path_bloom_filter_info *duplicate_of;
/*
* The offset relative to the start of the Modified Path Bloom
* Filters chunk where this Bloom filter has been written,
@@ -1099,6 +1100,9 @@ struct write_commit_graph_context {
/* Excluding embedded modified path Bloom filters */
uint64_t total_filter_size;
+
+ /* Used to find identical modified path Bloom filters */
+ struct hashmap dedup_hashmap;
} mpbfctx;
};
@@ -1315,7 +1319,11 @@ static int write_graph_chunk_modified_path_bloom_filters(struct hashfile *f,
bfi = modified_path_bloom_filters_peek(
&modified_path_bloom_filters, commit);
- if (!bfi || !bfi->filter.nr_bits)
+ if (!bfi)
+ continue;
+ if (bfi->duplicate_of)
+ continue;
+ if (!bfi->filter.nr_bits)
continue;
if (bfi->filter.nr_bits == GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS)
continue;
@@ -1364,6 +1372,9 @@ static int write_graph_chunk_modified_path_bloom_index(struct hashfile *f,
*/
filterdata[0] |= 1 << 7;
hashwrite(f, filterdata, sizeof(filterdata));
+ } else if (bfi->duplicate_of) {
+ uint64_t offset = htonll(bfi->duplicate_of->offset);
+ hashwrite(f, &offset, sizeof(offset));
} else if (bfi->offset != -1) {
uint64_t offset = htonll(bfi->offset);
hashwrite(f, &offset, sizeof(offset));
@@ -1515,6 +1526,53 @@ static void file_change_cb(struct diff_options *options,
handle_modified_file(options->change_fn_data, fullpath);
}
+struct modified_path_bloom_filter_dedup_entry {
+ struct hashmap_entry entry;
+ struct modified_path_bloom_filter_info *bfi;
+};
+
+static int modified_path_bloom_filter_dedup_cmp(const void *unused_cmp_data,
+ const struct hashmap_entry *he1,
+ const struct hashmap_entry *he2,
+ const void *unused_keydata)
+{
+ const struct modified_path_bloom_filter_dedup_entry *a, *b;
+
+ a = container_of(he1, const struct modified_path_bloom_filter_dedup_entry, entry);
+ b = container_of(he2, const struct modified_path_bloom_filter_dedup_entry, entry);
+
+ if (a->bfi->filter.nr_bits != b->bfi->filter.nr_bits)
+ return 1;
+
+ return memcmp(a->bfi->filter.bits, b->bfi->filter.bits,
+ bloom_filter_bytes(&a->bfi->filter));
+}
+
+static int handle_duplicate_modified_path_bloom_filter(
+ struct modified_path_bloom_filter_context *mpbfctx,
+ struct modified_path_bloom_filter_info *bfi)
+{
+ struct modified_path_bloom_filter_dedup_entry *e, stack_entry;
+ unsigned int h;
+
+ h = memhash(bfi->filter.bits, bloom_filter_bytes(&bfi->filter));
+ hashmap_entry_init(&stack_entry.entry, h);
+ stack_entry.bfi = bfi;
+
+ e = hashmap_get_entry(&mpbfctx->dedup_hashmap, &stack_entry, entry,
+ NULL);
+ if (e) {
+ bfi->duplicate_of = e->bfi;
+ return 1;
+ } else {
+ e = xmalloc(sizeof(*e));
+ hashmap_entry_init(&e->entry, h);
+ e->bfi = bfi;
+ hashmap_add(&mpbfctx->dedup_hashmap, &e->entry);
+ return 0;
+ }
+}
+
static void create_modified_path_bloom_filter(
struct modified_path_bloom_filter_context *mpbfctx,
struct commit *commit)
@@ -1547,17 +1605,20 @@ static void create_modified_path_bloom_filter(
bfi = modified_path_bloom_filters_at(&modified_path_bloom_filters,
commit);
bfi->offset = -1;
- if (path_component_count > mpbfctx->embedded_limit) {
+ if (path_component_count > mpbfctx->embedded_limit)
bloom_filter_init(&bfi->filter, mpbfctx->num_hashes,
path_component_count);
- mpbfctx->total_filter_size += sizeof(uint32_t) +
- bloom_filter_bytes(&bfi->filter);
- } else
+ else
bloom_filter_init_with_size(&bfi->filter,
GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS);
bloom_filter_set_bits(&bfi->filter, mpbfctx->hashes,
mpbfctx->hashes_nr);
+
+ if (path_component_count > mpbfctx->embedded_limit &&
+ !handle_duplicate_modified_path_bloom_filter(mpbfctx, bfi))
+ mpbfctx->total_filter_size += sizeof(uint32_t) +
+ bloom_filter_bytes(&bfi->filter);
}
static void add_missing_parents(struct write_commit_graph_context *ctx, struct commit *commit)
@@ -2396,6 +2457,8 @@ int write_commit_graph(const char *obj_dir,
diff_setup_done(&ctx->mpbfctx.diffopt);
strbuf_init(&ctx->mpbfctx.prev_path, PATH_MAX);
+ hashmap_init(&ctx->mpbfctx.dedup_hashmap,
+ modified_path_bloom_filter_dedup_cmp, NULL, 0);
}
if (ctx->split) {
@@ -2525,6 +2588,9 @@ int write_commit_graph(const char *obj_dir,
strbuf_release(&ctx->mpbfctx.prev_path);
free(ctx->mpbfctx.hashed_prefix_lengths);
free(ctx->mpbfctx.hashes);
+ hashmap_free_entries(&ctx->mpbfctx.dedup_hashmap,
+ struct modified_path_bloom_filter_dedup_entry,
+ entry);
}
free(ctx);
--
2.27.0.rc1.431.g5c813f95dc
next prev parent reply other threads:[~2020-05-29 8:51 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-05-29 8:50 [PoC PATCH 00/34] An alternative modified path Bloom filters implementation SZEDER Gábor
2020-05-28 22:09 ` Johannes Schindelin
2020-05-29 8:50 ` [PATCH 01/34] tree-walk.c: don't match submodule entries for 'submod/anything' SZEDER Gábor
2020-05-29 8:50 ` [PATCH 02/34] commit-graph: fix parsing the Chunk Lookup table SZEDER Gábor
2020-05-29 8:50 ` [PATCH 03/34] commit-graph-format.txt: all multi-byte numbers are in network byte order SZEDER Gábor
2020-05-29 8:50 ` [PATCH 04/34] commit-slab: add a function to deep free entries on the slab SZEDER Gábor
2020-06-04 16:43 ` Derrick Stolee
2020-06-27 15:56 ` SZEDER Gábor
2020-06-29 11:30 ` Derrick Stolee
2020-05-29 8:50 ` [PATCH 05/34] diff.h: drop diff_tree_oid() & friends' return value SZEDER Gábor
2020-05-29 8:50 ` [PATCH 06/34] commit-graph: clean up #includes SZEDER Gábor
2020-05-29 8:50 ` [PATCH 07/34] commit-graph: simplify parse_commit_graph() #1 SZEDER Gábor
2020-05-29 8:50 ` [PATCH 08/34] commit-graph: simplify parse_commit_graph() #2 SZEDER Gábor
2020-05-29 8:50 ` [PATCH 09/34] commit-graph: simplify write_commit_graph_file() #1 SZEDER Gábor
2020-05-29 8:50 ` [PATCH 10/34] commit-graph: simplify write_commit_graph_file() #2 SZEDER Gábor
2020-05-29 8:50 ` [PATCH 11/34] commit-graph: allocate the 'struct chunk_info' array dinamically SZEDER Gábor
2020-05-29 8:50 ` [PATCH 12/34] commit-graph: unify the signatures of all write_graph_chunk_*() functions SZEDER Gábor
2020-05-29 8:50 ` [PATCH 13/34] commit-graph: simplify write_commit_graph_file() #3 SZEDER Gábor
2020-05-29 8:50 ` [PATCH 14/34] commit-graph: check chunk sizes after writing SZEDER Gábor
2020-05-29 8:50 ` [PATCH 15/34] commit-graph-format.txt: document the modified path Bloom filter chunks SZEDER Gábor
2020-06-02 17:59 ` SZEDER Gábor
2020-05-29 8:50 ` [PATCH 16/34] Add a generic and minimal Bloom filter implementation SZEDER Gábor
2020-05-29 8:50 ` [PATCH 17/34] Import a streaming-capable Murmur3 hash function implementation SZEDER Gábor
2020-05-29 8:50 ` [PATCH 18/34] commit-graph: write "empty" Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29 8:50 ` [PATCH 19/34] commit-graph: add commit slab for modified path Bloom filters SZEDER Gábor
2020-05-29 8:50 ` [PATCH 20/34] commit-graph: fill the Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29 8:50 ` [PATCH 21/34] commit-graph: load and use " SZEDER Gábor
2020-05-29 8:50 ` [PATCH 22/34] commit-graph: write the Modified Path Bloom Filters chunk SZEDER Gábor
2020-05-29 8:50 ` [PATCH 23/34] commit-graph: load and use " SZEDER Gábor
2020-05-29 8:50 ` [PATCH 24/34] commit-graph: check all leading directories in modified path Bloom filters SZEDER Gábor
2020-05-29 8:50 ` [PATCH 25/34] commit-graph: check embedded modified path Bloom filters with a mask SZEDER Gábor
2020-05-29 8:50 ` SZEDER Gábor [this message]
2020-05-29 8:50 ` [PATCH 27/34] commit-graph: load modified path Bloom filters for merge commits SZEDER Gábor
2020-05-29 8:50 ` [PATCH 28/34] commit-graph: write Modified Path Bloom Filter Merge Index chunk SZEDER Gábor
2020-05-29 8:50 ` [PATCH 29/34] commit-graph: extract init and free write_commit_graph_context SZEDER Gábor
2020-05-29 8:50 ` [PATCH 30/34] commit-graph: move write_commit_graph_reachable below write_commit_graph SZEDER Gábor
2020-05-29 8:50 ` [PATCH 31/34] t7007-show: make the first test compatible with the next patch SZEDER Gábor
2020-05-29 8:50 ` [PATCH 32/34] PoC commit-graph: use revision walk machinery for '--reachable' SZEDER Gábor
2020-05-29 8:50 ` [PATCH 33/34] commit-graph: write modified path Bloom filters in "history order" SZEDER Gábor
2020-05-29 8:50 ` [PATCH 34/34] commit-graph: use modified path Bloom filters with wildcards, if possible SZEDER Gábor
2020-05-29 13:59 ` [PoC PATCH 00/34] An alternative modified path Bloom filters implementation Derrick Stolee
2020-06-01 23:25 ` Taylor Blau
2020-06-02 17:08 ` Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200529085038.26008-27-szeder.dev@gmail.com \
--to=szeder.dev@gmail.com \
--cc=garima.singh@microsoft.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=jnareb@gmail.com \
--cc=me@ttaylorr.com \
--cc=peff@peff.net \
--cc=stolee@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.