Re: [PATCH 10/15] midx: use chunk-format API in write_midx_internal()

From: Taylor Blau <me@ttaylorr.com>
To: Derrick Stolee via GitGitGadget <gitgitgadget@gmail.com>
Cc: git@vger.kernel.org, szeder.dev@gmail.com,
	Derrick Stolee <derrickstolee@github.com>,
	Derrick Stolee <dstolee@microsoft.com>
Subject: Re: [PATCH 10/15] midx: use chunk-format API in write_midx_internal()
Date: Tue, 8 Dec 2020 13:42:05 -0500	[thread overview]
Message-ID: <X8/I/RzXZksio+ri@nand.local> (raw)
In-Reply-To: <f2f78ee1054f64fe767335c4b0c77ba2fbec5fae.1607012215.git.gitgitgadget@gmail.com>

On Thu, Dec 03, 2020 at 04:16:49PM +0000, Derrick Stolee via GitGitGadget wrote:
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The chunk-format API allows automatically writing the table of contents
> for a chunk-based file format when using an array of "struct
> chunk_info"s. Update write_midx_internal() to use this strategy, which
> also simplifies the chunk writing loop. This loop will be replaced with
> a chunk-format API call in an upcoming change.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  midx.c | 96 +++++++++++++---------------------------------------------
>  1 file changed, 21 insertions(+), 75 deletions(-)
>
> diff --git a/midx.c b/midx.c
> index ce6d4339bd..0548266bea 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -11,6 +11,7 @@
>  #include "trace2.h"
>  #include "run-command.h"
>  #include "repository.h"
> +#include "chunk-format.h"
>
>  #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
>  #define MIDX_VERSION 1
> @@ -799,15 +800,14 @@ static int write_midx_large_offsets(struct hashfile *f,
>  static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
>  			       struct string_list *packs_to_drop, unsigned flags)
>  {
> -	unsigned char cur_chunk, num_chunks = 0;
> +	unsigned char num_chunks = 0;
>  	char *midx_name;
>  	uint32_t i;
>  	struct hashfile *f = NULL;
>  	struct lock_file lk;
>  	struct write_midx_context ctx = { 0 };
>  	uint64_t header_size = 0;
> -	uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
> -	uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
> +	struct chunk_info chunks[MIDX_MAX_CHUNKS];
>  	int pack_name_concat_len = 0;
>  	int dropped_packs = 0;
>  	int result = 0;
> @@ -923,7 +923,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>  	if (ctx.m)
>  		close_midx(ctx.m);
>
> -	cur_chunk = 0;
>  	num_chunks = ctx.large_offsets_needed ? 5 : 4;
>
>  	if (ctx.nr - dropped_packs == 0) {
> @@ -934,85 +933,32 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
>
>  	header_size = write_midx_header(f, num_chunks, ctx.nr - dropped_packs);
>
> -	chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES;
> -	chunk_offsets[cur_chunk] = header_size + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH;
> +	chunks[0].id = MIDX_CHUNKID_PACKNAMES;
> +	chunks[0].size = pack_name_concat_len;
> +	chunks[0].write_fn = write_midx_pack_names;

[...]

Hmm. The caller has to do quite a lot of work in order to feed chunks
into the new chunk-format code.

I wonder what it would look like if we introduced a new 'struct
chunkfile' type. It would know about the underlying hashfile that it's
writing to, as well as the chunks it's supposed to write. It would
likely support three operations: add a new chunk, write the TOC, and
write the contents (the latter two could probably be combined into one).

Below is a patch to do just that. It seems to work OK on t5319, but
fails some tests in t5318. I haven't looked into the commit-graph test
failures, but I'd be happy to do so if you think this is a direction
worth going in.

Before I show the patch, though, let's take a look at the stat:

 chunk-format.c | 61 ++++++++++++++++++++++++++++++++++++++----------
 chunk-format.h | 25 ++++++++++++--------
 commit-graph.c | 90 ++++++++++++++++++++++-------------------------------------------------
 midx.c         | 38 +++++++++++++-----------------
 4 files changed, 107 insertions(+), 107 deletions(-)

That's a little bit more code in the chunk-format code, which I can live
with, but the diff in commit-graph.c and midx.c is a net-negative, as I
would expect it to be. So, I'm happy that this seems to make things
easier on the callers.

OK, enough rambling. Here's the patch:

diff --git a/chunk-format.c b/chunk-format.c
index 771b6d98d0..14f2fe5c9a 100644
--- a/chunk-format.c
+++ b/chunk-format.c
@@ -1,26 +1,63 @@
 #include "git-compat-util.h"
 #include "chunk-format.h"
 #include "csum-file.h"
+#include "cache.h"
 #define CHUNK_LOOKUP_WIDTH 12

-void write_table_of_contents(struct hashfile *f,
-			     uint64_t cur_offset,
-			     struct chunk_info *chunks,
-			     int nr)
+void chunkfile_init(struct chunkfile *out, struct hashfile *f)
 {
-	int i;
+	if (!out)
+		BUG("cannot initialize null chunkfile");
+
+	memset(out, 0, sizeof(*out));
+	out->f = f;
+}
+
+void chunkfile_push_chunk(struct chunkfile *c,
+			  uint32_t id, uint64_t size, chunk_write_fn write_fn)
+{
+	ALLOC_GROW(c->chunks, c->chunks_nr + 1, c->chunks_alloc);
+	c->chunks[c->chunks_nr].id = id;
+	c->chunks[c->chunks_nr].size = size;
+	c->chunks[c->chunks_nr].write_fn = write_fn;
+	c->chunks_nr++;
+}
+
+void chunkfile_write_toc(struct chunkfile *c)
+{
+	size_t i;
+	off_t cur_offset = hashfile_total(c->f);

 	/* Add the table of contents to the current offset */
-	cur_offset += (nr + 1) * CHUNK_LOOKUP_WIDTH;
+	cur_offset += (c->chunks_nr + 1) * CHUNK_LOOKUP_WIDTH;

-	for (i = 0; i < nr; i++) {
-		hashwrite_be32(f, chunks[i].id);
-		hashwrite_be64(f, cur_offset);
+	for (i = 0; i < c->chunks_nr; i++) {
+		hashwrite_be32(c->f, c->chunks[i].id);
+		hashwrite_be64(c->f, cur_offset);

-		cur_offset += chunks[i].size;
+		cur_offset += c->chunks[i].size;
 	}

 	/* Trailing entry marks the end of the chunks */
-	hashwrite_be32(f, 0);
-	hashwrite_be64(f, cur_offset);
+	hashwrite_be32(c->f, 0);
+	hashwrite_be64(c->f, cur_offset);
+}
+
+void chunkfile_write_chunks(struct chunkfile *c, void *ctx)
+{
+	size_t i;
+
+	for (i = 0; i < c->chunks_nr; i++) {
+		off_t before, after;
+		struct chunk_info *chunk = &c->chunks[i];
+		if (!chunk->write_fn)
+			continue;
+
+		before = hashfile_total(c->f);
+		chunk->write_fn(c->f, ctx);
+		after = hashfile_total(c->f);
+
+		if (after - before != chunk->size)
+			BUG("unexpected chunk size");
+	}
 }
diff --git a/chunk-format.h b/chunk-format.h
index 4b9cbeb372..b0fb515425 100644
--- a/chunk-format.h
+++ b/chunk-format.h
@@ -19,18 +19,23 @@ struct chunk_info {
 	chunk_write_fn write_fn;
 };

+struct chunkfile {
+	struct hashfile *f;
+
+	struct chunk_info *chunks;
+	size_t chunks_nr;
+	size_t chunks_alloc;
+};
+
+void chunkfile_init(struct chunkfile *out, struct hashfile *f);
+void chunkfile_push_chunk(struct chunkfile *c,
+			  uint32_t id, uint64_t size, chunk_write_fn write_fn);
+
 /*
  * Write the chunk data into the supplied hashfile.
- *
- * * 'cur_offset' indicates the number of bytes written to the hashfile
- *   before the table of contents starts.
- *
- * * 'nr' is the number of chunks with non-zero IDs, so 'nr + 1'
- *   chunks are written in total.
  */
-void write_table_of_contents(struct hashfile *f,
-			     uint64_t cur_offset,
-			     struct chunk_info *chunks,
-			     int nr);
+void chunkfile_write_toc(struct chunkfile *c);
+
+void chunkfile_write_chunks(struct chunkfile *c, void *ctx);

 #endif
diff --git a/commit-graph.c b/commit-graph.c
index 5494fda1d3..abc0c9e46f 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1702,11 +1702,9 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	uint32_t i;
 	int fd;
 	struct hashfile *f;
+	struct chunkfile cf;
 	struct lock_file lk = LOCK_INIT;
-	struct chunk_info chunks[MAX_NUM_CHUNKS + 1];
 	const unsigned hashsz = the_hash_algo->rawsz;
-	struct strbuf progress_title = STRBUF_INIT;
-	int num_chunks = 3;
 	struct object_id file_hash;

 	if (ctx->split) {
@@ -1753,76 +1751,42 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 		f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
 	}

-	chunks[0].id = GRAPH_CHUNKID_OIDFANOUT;
-	chunks[0].size = GRAPH_FANOUT_SIZE;
-	chunks[0].write_fn = write_graph_chunk_fanout;
-	chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP;
-	chunks[1].size = hashsz * ctx->commits.nr;
-	chunks[1].write_fn = write_graph_chunk_oids;
-	chunks[2].id = GRAPH_CHUNKID_DATA;
-	chunks[2].size = (hashsz + 16) * ctx->commits.nr;
-	chunks[2].write_fn = write_graph_chunk_data;
-	if (ctx->num_extra_edges) {
-		chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES;
-		chunks[num_chunks].size = 4 * ctx->num_extra_edges;
-		chunks[num_chunks].write_fn = write_graph_chunk_extra_edges;
-		num_chunks++;
-	}
+	chunkfile_init(&cf, f);
+
+	chunkfile_push_chunk(&cf, GRAPH_CHUNKID_OIDFANOUT,
+			     GRAPH_FANOUT_SIZE, write_graph_chunk_fanout);
+	chunkfile_push_chunk(&cf, GRAPH_CHUNKID_OIDLOOKUP,
+			     hashsz * ctx->commits.nr, write_graph_chunk_oids);
+	chunkfile_push_chunk(&cf, GRAPH_CHUNKID_DATA,
+			     (hashsz + 16) * ctx->commits.nr, write_graph_chunk_data);
+	if (ctx->num_extra_edges)
+		chunkfile_push_chunk(&cf, GRAPH_CHUNKID_EXTRAEDGES,
+				     4 * ctx->num_extra_edges,
+				     write_graph_chunk_extra_edges);
 	if (ctx->changed_paths) {
-		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES;
-		chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr;
-		chunks[num_chunks].write_fn = write_graph_chunk_bloom_indexes;
-		num_chunks++;
-		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA;
-		chunks[num_chunks].size = sizeof(uint32_t) * 3
-					  + ctx->total_bloom_filter_data_size;
-		chunks[num_chunks].write_fn = write_graph_chunk_bloom_data;
-		num_chunks++;
-	}
-	if (ctx->num_commit_graphs_after > 1) {
-		chunks[num_chunks].id = GRAPH_CHUNKID_BASE;
-		chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1);
-		chunks[num_chunks].write_fn = write_graph_chunk_base;
-		num_chunks++;
+		chunkfile_push_chunk(&cf, GRAPH_CHUNKID_BLOOMINDEXES,
+				     sizeof(uint32_t) * ctx->commits.nr,
+				     write_graph_chunk_bloom_indexes);
+		chunkfile_push_chunk(&cf, GRAPH_CHUNKID_BLOOMDATA,
+				     sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size,
+				     write_graph_chunk_bloom_data);
 	}
+	if (ctx->num_commit_graphs_after > 1)
+		chunkfile_push_chunk(&cf, GRAPH_CHUNKID_BASE,
+				     hashsz * (ctx->num_commit_graphs_after - 1),
+				     write_graph_chunk_base);

-	chunks[num_chunks].id = 0;
-	chunks[num_chunks].size = 0;
+	chunkfile_push_chunk(&cf, 0, 0, NULL);

 	hashwrite_be32(f, GRAPH_SIGNATURE);

 	hashwrite_u8(f, GRAPH_VERSION);
 	hashwrite_u8(f, oid_version());
-	hashwrite_u8(f, num_chunks);
+	hashwrite_u8(f, cf.chunks_nr - 1);
 	hashwrite_u8(f, ctx->num_commit_graphs_after - 1);

-	write_table_of_contents(f, /* cur_offset */ 8, chunks, num_chunks);
-
-	if (ctx->report_progress) {
-		strbuf_addf(&progress_title,
-			    Q_("Writing out commit graph in %d pass",
-			       "Writing out commit graph in %d passes",
-			       num_chunks),
-			    num_chunks);
-		ctx->progress = start_delayed_progress(
-			progress_title.buf,
-			num_chunks * ctx->commits.nr);
-	}
-
-	for (i = 0; i < num_chunks; i++) {
-		uint64_t start_offset = f->total + f->offset;
-
-		if (chunks[i].write_fn(f, ctx))
-			return -1;
-
-		if (f->total + f->offset != start_offset + chunks[i].size)
-			BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
-			    chunks[i].size, chunks[i].id,
-			    f->total + f->offset - start_offset);
-	}
-
-	stop_progress(&ctx->progress);
-	strbuf_release(&progress_title);
+	chunkfile_write_toc(&cf);
+	chunkfile_write_chunks(&cf, ctx);

 	if (ctx->split && ctx->base_graph_name && ctx->num_commit_graphs_after > 1) {
 		char *new_base_hash = xstrdup(oid_to_hex(&ctx->new_base_graph->oid));
diff --git a/midx.c b/midx.c
index 0548266bea..6315ea7555 100644
--- a/midx.c
+++ b/midx.c
@@ -807,7 +807,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
 	struct lock_file lk;
 	struct write_midx_context ctx = { 0 };
 	uint64_t header_size = 0;
-	struct chunk_info chunks[MIDX_MAX_CHUNKS];
+	struct chunkfile cf;
 	int pack_name_concat_len = 0;
 	int dropped_packs = 0;
 	int result = 0;
@@ -933,33 +933,27 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *

 	header_size = write_midx_header(f, num_chunks, ctx.nr - dropped_packs);

-	chunks[0].id = MIDX_CHUNKID_PACKNAMES;
-	chunks[0].size = pack_name_concat_len;
-	chunks[0].write_fn = write_midx_pack_names;
+	chunkfile_init(&cf, f);

-	chunks[1].id = MIDX_CHUNKID_OIDFANOUT;
-	chunks[1].size = MIDX_CHUNK_FANOUT_SIZE;
-	chunks[1].write_fn = write_midx_oid_fanout;
-
-	chunks[2].id = MIDX_CHUNKID_OIDLOOKUP;
-	chunks[2].size = ctx.entries_nr * the_hash_algo->rawsz;
-	chunks[2].write_fn = write_midx_oid_lookup;
-
-	chunks[3].id = MIDX_CHUNKID_OBJECTOFFSETS;
-	chunks[3].size = ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH;
-	chunks[3].write_fn = write_midx_object_offsets;
+	chunkfile_push_chunk(&cf, MIDX_CHUNKID_PACKNAMES,
+			     pack_name_concat_len, write_midx_pack_names);
+	chunkfile_push_chunk(&cf, MIDX_CHUNKID_OIDFANOUT,
+			     MIDX_CHUNK_FANOUT_SIZE, write_midx_oid_fanout);
+	chunkfile_push_chunk(&cf, MIDX_CHUNKID_OIDLOOKUP,
+			     ctx.entries_nr * the_hash_algo->rawsz, write_midx_oid_lookup);
+	chunkfile_push_chunk(&cf, MIDX_CHUNKID_OBJECTOFFSETS,
+			     ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH, write_midx_object_offsets);

 	if (ctx.large_offsets_needed) {
-		chunks[4].id = MIDX_CHUNKID_LARGEOFFSETS;
-		chunks[4].size = ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH;
-		chunks[4].write_fn = write_midx_large_offsets;
+		chunkfile_push_chunk(&cf, MIDX_CHUNKID_LARGEOFFSETS,
+				     ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH,
+				     write_midx_large_offsets);
 	}

-	write_table_of_contents(f, header_size, chunks, num_chunks);
-
-	for (i = 0; i < num_chunks; i++)
-		chunks[i].write_fn(f, &ctx);
+	chunkfile_write_toc(&cf);
+	chunkfile_write_chunks(&cf, &ctx);

+	/* maybe move this into chunkfile??? */
 	finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
 	commit_lock_file(&lk);