All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: szeder.dev@gmail.com, me@ttaylorr.com,
	Derrick Stolee <derrickstolee@github.com>,
	Derrick Stolee <dstolee@microsoft.com>
Subject: [PATCH 02/15] chunk-format: add API for writing table of contents
Date: Thu, 03 Dec 2020 16:16:41 +0000	[thread overview]
Message-ID: <da7933cc59928c7995821dcc72ceae0c690ebaca.1607012215.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.804.git.1607012215.gitgitgadget@gmail.com>

From: Derrick Stolee <dstolee@microsoft.com>

The commit-graph and multi-pack-index formats share a concept of
"chunks" that are described by a table of contents near the beginning of
the file.

The table of contents consists of rows of 12 bytes. Each row starts with
a 4-byte ID that signals the type of data stored in the chunk. The row
then continues with an 8-byte offset describing the position in the file
where that data starts. The table of contents lists the chunks in
position order so the length of a chunk can be determined by subtracting
its start position from the start position of the next chunk.

The table of contents always ends with a row whose ID is 0x00000000 to
assist finding the end of the last "real" chunk. Typically, the offset
stored in this final row points to the trailing hash of a file.

Convert the loop that writes the table of contents in commit-graph.c to
use the new write_table_of_contents() function in chunk-format.c.

The most subtle part of this conversion is the use of 'cur_offset' to
allow the caller to specify how many bytes were written in the file's
header before the table of contents. This may differ between formats.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Makefile       |  1 +
 chunk-format.c | 26 ++++++++++++++++++++++++++
 chunk-format.h | 36 ++++++++++++++++++++++++++++++++++++
 commit-graph.c | 23 ++---------------------
 4 files changed, 65 insertions(+), 21 deletions(-)
 create mode 100644 chunk-format.c
 create mode 100644 chunk-format.h

diff --git a/Makefile b/Makefile
index d3a531d3c6..cdbcadac14 100644
--- a/Makefile
+++ b/Makefile
@@ -854,6 +854,7 @@ LIB_OBJS += bundle.o
 LIB_OBJS += cache-tree.o
 LIB_OBJS += chdir-notify.o
 LIB_OBJS += checkout.o
+LIB_OBJS += chunk-format.o
 LIB_OBJS += color.o
 LIB_OBJS += column.o
 LIB_OBJS += combine-diff.o
diff --git a/chunk-format.c b/chunk-format.c
new file mode 100644
index 0000000000..771b6d98d0
--- /dev/null
+++ b/chunk-format.c
@@ -0,0 +1,26 @@
+#include "git-compat-util.h"
+#include "chunk-format.h"
+#include "csum-file.h"
+#define CHUNK_LOOKUP_WIDTH 12
+
+void write_table_of_contents(struct hashfile *f,
+			     uint64_t cur_offset,
+			     struct chunk_info *chunks,
+			     int nr)
+{
+	int i;
+
+	/* Advance the offset past the table of contents (nr + 1 rows) */
+	cur_offset += (nr + 1) * CHUNK_LOOKUP_WIDTH;
+
+	for (i = 0; i < nr; i++) {
+		hashwrite_be32(f, chunks[i].id);
+		hashwrite_be64(f, cur_offset);
+
+		cur_offset += chunks[i].size;
+	}
+
+	/* Trailing entry marks the end of the chunks */
+	hashwrite_be32(f, 0);
+	hashwrite_be64(f, cur_offset);
+}
diff --git a/chunk-format.h b/chunk-format.h
new file mode 100644
index 0000000000..4b9cbeb372
--- /dev/null
+++ b/chunk-format.h
@@ -0,0 +1,36 @@
+#ifndef CHUNK_FORMAT_H
+#define CHUNK_FORMAT_H
+
+#include "git-compat-util.h"
+
+struct hashfile;
+
+typedef int (*chunk_write_fn)(struct hashfile *f,
+			      void *data);
+
+/*
+ * When writing a chunk-based file format, collect the chunks in
+ * an array of chunk_info structs. The size stores the _expected_
+ * amount of data that will be written by write_fn.
+ */
+struct chunk_info {
+	uint32_t id;
+	uint64_t size;
+	chunk_write_fn write_fn;
+};
+
+/*
+ * Write the table of contents for 'chunks' into the supplied hashfile.
+ *
+ * * 'cur_offset' indicates the number of bytes written to the hashfile
+ *   before the table of contents starts.
+ *
+ * * 'nr' is the number of chunks with non-zero IDs, so 'nr + 1'
+ *   rows are written in total, including the terminating row.
+ */
+void write_table_of_contents(struct hashfile *f,
+			     uint64_t cur_offset,
+			     struct chunk_info *chunks,
+			     int nr);
+
+#endif
diff --git a/commit-graph.c b/commit-graph.c
index 6b5bb8b6b8..5494fda1d3 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -19,6 +19,7 @@
 #include "shallow.h"
 #include "json-writer.h"
 #include "trace2.h"
+#include "chunk-format.h"
 
 void git_test_write_commit_graph_or_die(void)
 {
@@ -1696,15 +1697,6 @@ static int write_graph_chunk_base(struct hashfile *f,
 	return 0;
 }
 
-typedef int (*chunk_write_fn)(struct hashfile *f,
-			      void *data);
-
-struct chunk_info {
-	uint32_t id;
-	uint64_t size;
-	chunk_write_fn write_fn;
-};
-
 static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 {
 	uint32_t i;
@@ -1715,7 +1707,6 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	const unsigned hashsz = the_hash_algo->rawsz;
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
-	uint64_t chunk_offset;
 	struct object_id file_hash;
 
 	if (ctx->split) {
@@ -1805,17 +1796,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	hashwrite_u8(f, num_chunks);
 	hashwrite_u8(f, ctx->num_commit_graphs_after - 1);
 
-	chunk_offset = 8 + (num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH;
-	for (i = 0; i <= num_chunks; i++) {
-		uint32_t chunk_write[3];
-
-		chunk_write[0] = htonl(chunks[i].id);
-		chunk_write[1] = htonl(chunk_offset >> 32);
-		chunk_write[2] = htonl(chunk_offset & 0xffffffff);
-		hashwrite(f, chunk_write, 12);
-
-		chunk_offset += chunks[i].size;
-	}
+	write_table_of_contents(f, /* cur_offset */ 8, chunks, num_chunks);
 
 	if (ctx->report_progress) {
 		strbuf_addf(&progress_title,
-- 
gitgitgadget


  parent reply	other threads:[~2020-12-03 16:17 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-03 16:16 [PATCH 00/15] Refactor chunk-format into an API Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` [PATCH 01/15] commit-graph: anonymize data in chunk_write_fn Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` Derrick Stolee via GitGitGadget [this message]
2020-12-08 17:56   ` [PATCH 02/15] chunk-format: add API for writing table of contents Taylor Blau
2020-12-03 16:16 ` [PATCH 03/15] midx: rename pack_info to write_midx_context Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` [PATCH 04/15] midx: use context in write_midx_pack_names() Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` [PATCH 05/15] midx: add entries to write_midx_context Derrick Stolee via GitGitGadget
2020-12-03 21:42   ` Junio C Hamano
2020-12-04 13:39     ` Derrick Stolee
2020-12-08 18:00   ` Taylor Blau
2020-12-03 16:16 ` [PATCH 06/15] midx: add pack_perm " Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` [PATCH 07/15] midx: add num_large_offsets " Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` [PATCH 08/15] midx: convert chunk write methods to return int Derrick Stolee via GitGitGadget
2020-12-03 21:50   ` Junio C Hamano
2020-12-04 13:40     ` Derrick Stolee
2020-12-03 16:16 ` [PATCH 09/15] midx: drop chunk progress during write Derrick Stolee via GitGitGadget
2020-12-03 16:16 ` [PATCH 10/15] midx: use chunk-format API in write_midx_internal() Derrick Stolee via GitGitGadget
2020-12-08 18:42   ` Taylor Blau
2020-12-10 14:36     ` Derrick Stolee
2020-12-03 16:16 ` [PATCH 11/15] midx: use 64-bit multiplication for chunk sizes Derrick Stolee via GitGitGadget
2020-12-03 22:00   ` Junio C Hamano
2020-12-08 18:43     ` Taylor Blau
2020-12-03 16:16 ` [PATCH 12/15] chunk-format: create write_chunks() Derrick Stolee via GitGitGadget
2020-12-08 18:45   ` Taylor Blau
2020-12-03 16:16 ` [PATCH 13/15] chunk-format: create chunk reading API Derrick Stolee via GitGitGadget
2020-12-03 22:17   ` Junio C Hamano
2020-12-04 13:47     ` Derrick Stolee
2020-12-04 20:17       ` Junio C Hamano
2020-12-03 22:43   ` Junio C Hamano
2020-12-04 13:45     ` Derrick Stolee
2020-12-03 16:16 ` [PATCH 14/15] commit-graph: restore duplicate chunk checks Derrick Stolee via GitGitGadget
2020-12-07 13:43   ` Derrick Stolee
2020-12-03 16:16 ` [PATCH 15/15] chunk-format: add technical docs Derrick Stolee via GitGitGadget
2020-12-04 12:48 ` [PATCH 00/15] Refactor chunk-format into an API René Scharfe
2020-12-04 13:57   ` Derrick Stolee
2020-12-04 19:42   ` Junio C Hamano
2020-12-08 18:49   ` Taylor Blau
2020-12-09 17:13     ` René Scharfe
2020-12-10  0:50       ` Taylor Blau
2020-12-10 14:30         ` Derrick Stolee

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=da7933cc59928c7995821dcc72ceae0c690ebaca.1607012215.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=derrickstolee@github.com \
    --cc=dstolee@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=me@ttaylorr.com \
    --cc=szeder.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.