Git Mailing List Archive on lore.kernel.org
 help / color / Atom feed
From: "SZEDER Gábor" <szeder.dev@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Garima Singh" <garima.singh@microsoft.com>,
	"Derrick Stolee" <stolee@gmail.com>,
	"Jakub Narebski" <jnareb@gmail.com>, "Jeff King" <peff@peff.net>,
	"Taylor Blau" <me@ttaylorr.com>,
	"SZEDER Gábor" <szeder.dev@gmail.com>
Subject: [PATCH 23/34] commit-graph: load and use the Modified Path Bloom Filters chunk
Date: Fri, 29 May 2020 10:50:27 +0200
Message-ID: <20200529085038.26008-24-szeder.dev@gmail.com> (raw)
In-Reply-To: <20200529085038.26008-1-szeder.dev@gmail.com>

The table below compares the average runtime and memory usage of

  git rev-list HEAD -- "$path"

with and without modified path Bloom filters for 5000+ randomly
selected paths from each repository:

                  Average runtime            |       Max RSS
                 without     with   Speedup  |  without   with
  -------------------------------------------+-------------------------
  android-base   0.8780s   0.1742s    5.04x  |   387MB   227MB   -41.4%
  cmssw          0.3143s   0.0452s    6.95x  |   181MB    79MB   -56.4%
  cpython        0.7453s   0.0956s    7.80x  |   159MB    71MB   -55.4%
  elasticsearch  0.1492s   0.0191s    7.82x  |   134MB    53MB   -60.2%
  gcc            7.1852s   0.3584s   20.05x  |   297MB   231MB   -22.3%
  gecko-dev      4.6113s   0.6318s    7.30x  |   832MB   600MB   -27.9%
  git            0.6180s   0.0405s   15.26x  |   131MB    43MB   -67.2%
  glibc          0.5618s   0.0471s   11.93x  |   136MB    50MB   -63.3%
  go             0.4913s   0.0515s    9.53x  |   130MB    50MB   -61.5%
  jdk            0.0482s   0.0089s    5.42x  |    52MB    39MB   -25.0%
  linux          0.7043s   0.1129s    6.24x  |   438MB   270MB   -38.4%
  llvm-project   2.6844s   0.4873s    5.51x  |   384MB   282MB   -26.6%
  rails          0.2784s   0.0484s    5.75x  |    88MB    51MB   -42.1%
  rust           0.7757s   0.0619s   12.53x  |   345MB    89MB   -74.3%
  tensorflow     0.6258s   0.0642s    9.74x  |   233MB    85MB   -63.5%
  webkit         1.9137s   0.3332s    5.74x  |   941MB   480MB   -49.0%
---
 commit-graph.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
 commit-graph.h |  2 ++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 3210ec2f93..f9a21ecdfb 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -327,6 +327,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 			}
 			break;
 
+		case GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTERS:
+			if (graph->chunk_mpbf_filters)
+				chunk_repeated = 1;
+			else {
+				graph->chunk_mpbf_filters = data + chunk_offset;
+				graph->chunk_mpbf_filters_size = next_chunk_offset - chunk_offset;
+			}
+			break;
+
 		case GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_EXCLUDES:
 			if (graph->chunk_mpbf_excludes)
 				chunk_repeated = 1;
@@ -830,6 +839,7 @@ static int load_modified_path_bloom_filter_from_graph(
 		struct commit *parent, struct bloom_filter *bf)
 {
 	const uint8_t *bloom_index;
+	uint64_t offset;
 	int first_parent = 0;
 
 	if (commit->graph_pos == COMMIT_NOT_FROM_GRAPH)
@@ -857,9 +867,39 @@ static int load_modified_path_bloom_filter_from_graph(
 		bf->nr_bits = GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS;
 		bf->bits = (uint8_t*) bloom_index;
 		return 1;
+	} else if (bloom_index[0] & (1 << 6)) {
+		/*
+		 * Modified path Bloom filters for second..nth parents of
+		 * merge commits are not implemented yet.
+		 */
+		return 0;
+	} else {
+		if (!first_parent)
+			return 0;
+		offset = get_be64(bloom_index);
 	}
-	/* support for non-embedded Bloom filters is not implemented yet. */
-	return 0;
+
+	if (!graph->chunk_mpbf_filters)
+		BUG("commit %s refers to offset %lu of the Modified Path Bloom Filters chunk, but that chunk is missing",
+		    oid_to_hex(&commit->object.oid), offset);
+
+	if (offset + sizeof(uint32_t) >= graph->chunk_mpbf_filters_size)
+		BUG("commit %s refers to offset %lu of the Modified Path Bloom Filters chunk, but that's too large for chunk of size %lu bytes",
+		    oid_to_hex(&commit->object.oid), offset,
+		    graph->chunk_mpbf_filters_size);
+
+	bf->nr_bits = get_be32(graph->chunk_mpbf_filters + offset);
+	if (!bf->nr_bits)
+		BUG("commit %s has a modified path Bloom filter at offset %lu, which has zero size",
+		    oid_to_hex(&commit->object.oid), offset);
+	if (offset + sizeof(uint32_t) + bloom_filter_bytes(bf) > graph->chunk_mpbf_filters_size)
+		BUG("commit %s has a modified path Bloom filter of %u bits at offset %lu, which doesn't fit into a Modified Path Bloom Filters chunk of %lu bytes",
+		    oid_to_hex(&commit->object.oid), bf->nr_bits, offset,
+		    graph->chunk_mpbf_filters_size);
+	/* Casting away const-ness :( */
+	bf->bits = (uint8_t*)(graph->chunk_mpbf_filters + offset + sizeof(uint32_t));
+
+	return 1;
 }
 
 enum bloom_result check_modified_path_bloom_filter(struct repository *r,
diff --git a/commit-graph.h b/commit-graph.h
index 09dfc16932..cde0d7fa30 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -69,6 +69,8 @@ struct commit_graph {
 	const unsigned char *chunk_base_graphs;
 	const unsigned char *chunk_mpbf_index;
 	uint64_t chunk_mpbf_index_size;
+	const unsigned char *chunk_mpbf_filters;
+	uint64_t chunk_mpbf_filters_size;
 	const unsigned char *chunk_mpbf_excludes;
 
 	uint8_t num_modified_path_bloom_hashes;
-- 
2.27.0.rc1.431.g5c813f95dc


  parent reply index

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-29  8:50 [PoC PATCH 00/34] An alternative modified path Bloom filters implementation SZEDER Gábor
2020-05-28 22:09 ` Johannes Schindelin
2020-05-29  8:50 ` [PATCH 01/34] tree-walk.c: don't match submodule entries for 'submod/anything' SZEDER Gábor
2020-05-29  8:50 ` [PATCH 02/34] commit-graph: fix parsing the Chunk Lookup table SZEDER Gábor
2020-05-29  8:50 ` [PATCH 03/34] commit-graph-format.txt: all multi-byte numbers are in network byte order SZEDER Gábor
2020-05-29  8:50 ` [PATCH 04/34] commit-slab: add a function to deep free entries on the slab SZEDER Gábor
2020-06-04 16:43   ` Derrick Stolee
2020-06-27 15:56     ` SZEDER Gábor
2020-06-29 11:30       ` Derrick Stolee
2020-05-29  8:50 ` [PATCH 05/34] diff.h: drop diff_tree_oid() & friends' return value SZEDER Gábor
2020-05-29  8:50 ` [PATCH 06/34] commit-graph: clean up #includes SZEDER Gábor
2020-05-29  8:50 ` [PATCH 07/34] commit-graph: simplify parse_commit_graph() #1 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 08/34] commit-graph: simplify parse_commit_graph() #2 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 09/34] commit-graph: simplify write_commit_graph_file() #1 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 10/34] commit-graph: simplify write_commit_graph_file() #2 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 11/34] commit-graph: allocate the 'struct chunk_info' array dinamically SZEDER Gábor
2020-05-29  8:50 ` [PATCH 12/34] commit-graph: unify the signatures of all write_graph_chunk_*() functions SZEDER Gábor
2020-05-29  8:50 ` [PATCH 13/34] commit-graph: simplify write_commit_graph_file() #3 SZEDER Gábor
2020-05-29  8:50 ` [PATCH 14/34] commit-graph: check chunk sizes after writing SZEDER Gábor
2020-05-29  8:50 ` [PATCH 15/34] commit-graph-format.txt: document the modified path Bloom filter chunks SZEDER Gábor
2020-06-02 17:59   ` SZEDER Gábor
2020-05-29  8:50 ` [PATCH 16/34] Add a generic and minimal Bloom filter implementation SZEDER Gábor
2020-05-29  8:50 ` [PATCH 17/34] Import a streaming-capable Murmur3 hash function implementation SZEDER Gábor
2020-05-29  8:50 ` [PATCH 18/34] commit-graph: write "empty" Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 19/34] commit-graph: add commit slab for modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 20/34] commit-graph: fill the Modified Path Bloom Filter Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 21/34] commit-graph: load and use " SZEDER Gábor
2020-05-29  8:50 ` [PATCH 22/34] commit-graph: write the Modified Path Bloom Filters chunk SZEDER Gábor
2020-05-29  8:50 ` SZEDER Gábor [this message]
2020-05-29  8:50 ` [PATCH 24/34] commit-graph: check all leading directories in modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 25/34] commit-graph: check embedded modified path Bloom filters with a mask SZEDER Gábor
2020-05-29  8:50 ` [PATCH 26/34] commit-graph: deduplicate modified path Bloom filters SZEDER Gábor
2020-05-29  8:50 ` [PATCH 27/34] commit-graph: load modified path Bloom filters for merge commits SZEDER Gábor
2020-05-29  8:50 ` [PATCH 28/34] commit-graph: write Modified Path Bloom Filter Merge Index chunk SZEDER Gábor
2020-05-29  8:50 ` [PATCH 29/34] commit-graph: extract init and free write_commit_graph_context SZEDER Gábor
2020-05-29  8:50 ` [PATCH 30/34] commit-graph: move write_commit_graph_reachable below write_commit_graph SZEDER Gábor
2020-05-29  8:50 ` [PATCH 31/34] t7007-show: make the first test compatible with the next patch SZEDER Gábor
2020-05-29  8:50 ` [PATCH 32/34] PoC commit-graph: use revision walk machinery for '--reachable' SZEDER Gábor
2020-05-29  8:50 ` [PATCH 33/34] commit-graph: write modified path Bloom filters in "history order" SZEDER Gábor
2020-05-29  8:50 ` [PATCH 34/34] commit-graph: use modified path Bloom filters with wildcards, if possible SZEDER Gábor
2020-05-29 13:59 ` [PoC PATCH 00/34] An alternative modified path Bloom filters implementation Derrick Stolee
2020-06-01 23:25 ` Taylor Blau
2020-06-02 17:08   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200529085038.26008-24-szeder.dev@gmail.com \
    --to=szeder.dev@gmail.com \
    --cc=garima.singh@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=jnareb@gmail.com \
    --cc=me@ttaylorr.com \
    --cc=peff@peff.net \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Mailing List Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/git/0 git/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 git git/ https://lore.kernel.org/git \
		git@vger.kernel.org
	public-inbox-index git

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.git


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git