* [PATCH v4 0/7] merge-ort: implement support for packing objects together
@ 2023-10-19 17:28 Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
                   ` (10 more replies)
  0 siblings, 11 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

(Rebased onto the current tip of 'master', which is 813d9a9188 (The
nineteenth batch, 2023-10-18) at the time of writing).

This series implements support for a new merge-tree option,
`--write-pack`, which causes any newly-written objects to be packed
together instead of being stored individually as loose.
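
For example, an illustrative invocation using the new option (which is
added by the final patch in this series) might look like:

    $ git merge-tree --write-tree --write-pack side other

where any new blobs and trees created during the merge land in a
single new pack under $GIT_DIR/objects/pack instead of being written
out as individual loose objects.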

I intentionally broke this off from the existing thread, since I
accidentally rerolled my and Jonathan Tan's Bloom v2 series into it,
causing some confusion.

This is a new round that is significantly simplified thanks to
another very helpful suggestion[1] from Junio. By factoring out a
common "deflate object to pack" routine that takes an abstract
bulk_checkin_source as a parameter, all of the earlier refactorings
can be dropped, since we retain only a single caller instead of
multiple.

This resulted in a rather satisfying range-diff (included below, as
usual) and a similarly satisfying interdiff:

    $ git diff --stat tb/ort-bulk-checkin.v3..
     bulk-checkin.c | 203 ++++++++++++++++---------------------------------
     1 file changed, 64 insertions(+), 139 deletions(-)

Beyond that, the changes since last time can be viewed in the range-diff
below. Thanks in advance for any review!

[1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/

Taylor Blau (7):
  bulk-checkin: extract abstract `bulk_checkin_source`
  bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
  bulk-checkin: refactor deflate routine to accept a
    `bulk_checkin_source`
  bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
  builtin/merge-tree.c: implement support for `--write-pack`

 Documentation/git-merge-tree.txt |   4 +
 builtin/merge-tree.c             |   5 +
 bulk-checkin.c                   | 161 ++++++++++++++++++++++++++-----
 bulk-checkin.h                   |   8 ++
 merge-ort.c                      |  42 ++++++--
 merge-recursive.h                |   1 +
 t/t4301-merge-tree-write-tree.sh |  93 ++++++++++++++++++
 7 files changed, 280 insertions(+), 34 deletions(-)

Range-diff against v3:
 1:  2dffa45183 <  -:  ---------- bulk-checkin: factor out `format_object_header_hash()`
 2:  7a10dc794a <  -:  ---------- bulk-checkin: factor out `prepare_checkpoint()`
 3:  20c32d2178 <  -:  ---------- bulk-checkin: factor out `truncate_checkpoint()`
 4:  893051d0b7 <  -:  ---------- bulk-checkin: factor out `finalize_checkpoint()`
 5:  da52ec8380 !  1:  97bb6e9f59 bulk-checkin: extract abstract `bulk_checkin_source`
    @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
      			if (*already_hashed_to < offset) {
      				size_t hsize = offset - *already_hashed_to;
     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    - 	git_hash_ctx ctx;
    + 	unsigned header_len;
      	struct hashfile_checkpoint checkpoint = {0};
      	struct pack_idx_entry *idx = NULL;
     +	struct bulk_checkin_source source = {
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      	seekback = lseek(fd, 0, SEEK_CUR);
      	if (seekback == (off_t) -1)
     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    - 	while (1) {
    - 		prepare_checkpoint(state, &checkpoint, idx, flags);
    + 			crc32_begin(state->f);
    + 		}
      		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
     -					 fd, size, path, flags))
     +					 &source, flags))
      			break;
    - 		truncate_checkpoint(state, &checkpoint, idx);
    + 		/*
    + 		 * Writing this object to the current pack will make
    +@@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    + 		hashfile_truncate(state->f, &checkpoint);
    + 		state->offset = checkpoint.offset;
    + 		flush_bulk_checkin_packfile(state);
     -		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
     +		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
      			return error("cannot seek back");
      	}
    - 	finalize_checkpoint(state, &ctx, &checkpoint, idx, result_oid);
    + 	the_hash_algo->final_oid_fn(result_oid, &ctx);
 7:  04ec74e357 !  2:  9d633df339 bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
    @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
      	s.avail_out = sizeof(obuf) - hdrlen;
      
     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    - 
    - 	while (1) {
    - 		prepare_checkpoint(state, &checkpoint, idx, flags);
    + 			idx->offset = state->offset;
    + 			crc32_begin(state->f);
    + 		}
     -		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
     -					 &source, flags))
     +		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
     +					&source, OBJ_BLOB, flags))
      			break;
    - 		truncate_checkpoint(state, &checkpoint, idx);
    - 		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
    + 		/*
    + 		 * Writing this object to the current pack will make
 -:  ---------- >  3:  d5bbd7810e bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
 6:  4e9bac5bc1 =  4:  e427fe6ad3 bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
 8:  8667b76365 !  5:  48095afe80 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
    @@ Commit message
         objects individually as loose.
     
         Similar to the existing `index_blob_bulk_checkin()` function, the
    -    entrypoint delegates to `deflate_blob_to_pack_incore()`, which is
    -    responsible for formatting the pack header and then deflating the
    -    contents into the pack. The latter is accomplished by calling
    -    deflate_obj_contents_to_pack_incore(), which takes advantage of the
    -    earlier refactorings and is responsible for writing the object to the
    -    pack and handling any overage from pack.packSizeLimit.
    -
    -    The bulk of the new functionality is implemented in the function
    -    `stream_obj_to_pack()`, which can handle streaming objects from memory
    -    to the bulk-checkin pack as a result of the earlier refactoring.
    +    entrypoint delegates to `deflate_obj_to_pack_incore()`. That function in
    +    turn delegates to deflate_obj_to_pack(), which is responsible for
    +    formatting the pack header and then deflating the contents into the
    +    pack.
     
         Consistent with the rest of the bulk-checkin mechanism, there are no
         direct tests here. In future commits when we expose this new
    @@ Commit message
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
      ## bulk-checkin.c ##
    -@@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *state,
    - 	}
    +@@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
    + 	return 0;
      }
      
    -+static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state,
    -+					       git_hash_ctx *ctx,
    -+					       struct hashfile_checkpoint *checkpoint,
    -+					       struct object_id *result_oid,
    -+					       const void *buf, size_t size,
    -+					       enum object_type type,
    -+					       const char *path, unsigned flags)
    ++static int deflate_obj_to_pack_incore(struct bulk_checkin_packfile *state,
    ++				       struct object_id *result_oid,
    ++				       const void *buf, size_t size,
    ++				       const char *path, enum object_type type,
    ++				       unsigned flags)
     +{
    -+	struct pack_idx_entry *idx = NULL;
    -+	off_t already_hashed_to = 0;
     +	struct bulk_checkin_source source = {
     +		.type = SOURCE_INCORE,
     +		.buf = buf,
    @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st
     +		.path = path,
     +	};
     +
    -+	/* Note: idx is non-NULL when we are writing */
    -+	if (flags & HASH_WRITE_OBJECT)
    -+		CALLOC_ARRAY(idx, 1);
    -+
    -+	while (1) {
    -+		prepare_checkpoint(state, checkpoint, idx, flags);
    -+
    -+		if (!stream_obj_to_pack(state, ctx, &already_hashed_to, &source,
    -+					type, flags))
    -+			break;
    -+		truncate_checkpoint(state, checkpoint, idx);
    -+		bulk_checkin_source_seek_to(&source, 0);
    -+	}
    -+
    -+	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
    -+
    -+	return 0;
    -+}
    -+
    -+static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
    -+				       struct object_id *result_oid,
    -+				       const void *buf, size_t size,
    -+				       const char *path, unsigned flags)
    -+{
    -+	git_hash_ctx ctx;
    -+	struct hashfile_checkpoint checkpoint = {0};
    -+
    -+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
    -+				  size);
    -+
    -+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
    -+						   result_oid, buf, size,
    -+						   OBJ_BLOB, path, flags);
    ++	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
     +}
     +
      static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    @@ bulk-checkin.c: int index_blob_bulk_checkin(struct object_id *oid,
     +				   const void *buf, size_t size,
     +				   const char *path, unsigned flags)
     +{
    -+	int status = deflate_blob_to_pack_incore(&bulk_checkin_packfile, oid,
    -+						 buf, size, path, flags);
    ++	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
    ++						buf, size, path, OBJ_BLOB,
    ++						flags);
     +	if (!odb_transaction_nesting)
     +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
     +	return status;
 9:  cba043ef14 !  6:  60568f9281 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
    @@ Commit message
         machinery will have enough to keep track of the converted object's hash
         in order to update the compatibility mapping.
     
    -    Within `deflate_tree_to_pack_incore()`, the changes should be limited
    -    to something like:
    +    Within some thin wrapper around `deflate_obj_to_pack_incore()` (perhaps
    +    `deflate_tree_to_pack_incore()`), the changes should be limited to
    +    something like:
     
             struct strbuf converted = STRBUF_INIT;
             if (the_repository->compat_hash_algo) {
    @@ Commit message
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
      ## bulk-checkin.c ##
    -@@ bulk-checkin.c: static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
    - 						   OBJ_BLOB, path, flags);
    - }
    - 
    -+static int deflate_tree_to_pack_incore(struct bulk_checkin_packfile *state,
    -+				       struct object_id *result_oid,
    -+				       const void *buf, size_t size,
    -+				       const char *path, unsigned flags)
    -+{
    -+	git_hash_ctx ctx;
    -+	struct hashfile_checkpoint checkpoint = {0};
    -+
    -+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_TREE,
    -+				  size);
    -+
    -+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
    -+						   result_oid, buf, size,
    -+						   OBJ_TREE, path, flags);
    -+}
    -+
    - static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    - 				struct object_id *result_oid,
    - 				int fd, size_t size,
     @@ bulk-checkin.c: int index_blob_bulk_checkin_incore(struct object_id *oid,
      	return status;
      }
    @@ bulk-checkin.c: int index_blob_bulk_checkin_incore(struct object_id *oid,
     +				   const void *buf, size_t size,
     +				   const char *path, unsigned flags)
     +{
    -+	int status = deflate_tree_to_pack_incore(&bulk_checkin_packfile, oid,
    -+						 buf, size, path, flags);
    ++	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
    ++						buf, size, path, OBJ_TREE,
    ++						flags);
     +	if (!odb_transaction_nesting)
     +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
     +	return status;
10:  ae70508037 =  7:  b9be9df122 builtin/merge-tree.c: implement support for `--write-pack`
-- 
2.42.0.405.g86fe3250c2


* [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
@ 2023-10-19 17:28 ` Taylor Blau
  2023-10-20  7:35   ` Jeff King
  2023-10-19 17:28 ` [PATCH v4 2/7] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types Taylor Blau
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

A future commit will want to implement a routine very similar to
`stream_blob_to_pack()`, with two notable changes:

  - Instead of streaming just OBJ_BLOBs, this new function may want to
    stream objects of arbitrary type.

  - Instead of streaming the object's contents from an open file
    descriptor, this new function may want to "stream" its contents
    from memory.

To avoid duplicating a significant chunk of code between the existing
`stream_blob_to_pack()` and its future variant, extract an abstract
`bulk_checkin_source`. This concept is currently a thin layer over
`lseek()` and `read_in_full()`, but will grow to understand how to
perform analogous operations when writing out an object's contents
from memory.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 61 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 8 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6ce62999e5..c05d06e1e1 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -140,8 +140,41 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
 	return 0;
 }
 
+struct bulk_checkin_source {
+	enum { SOURCE_FILE } type;
+
+	/* SOURCE_FILE fields */
+	int fd;
+
+	/* common fields */
+	size_t size;
+	const char *path;
+};
+
+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
+					 off_t offset)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return lseek(source->fd, offset, SEEK_SET);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
+					void *buf, size_t nr)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return read_in_full(source->fd, buf, nr);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
 /*
- * Read the contents from fd for size bytes, streaming it to the
+ * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
  * by returning a negative value when the resulting pack would exceed
  * the pack size limit and this is not the first object in the pack,
@@ -157,7 +190,7 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
  */
 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       int fd, size_t size, const char *path,
+			       struct bulk_checkin_source *source,
 			       unsigned flags)
 {
 	git_zstream s;
@@ -167,22 +200,28 @@ static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 	int status = Z_OK;
 	int write_object = (flags & HASH_WRITE_OBJECT);
 	off_t offset = 0;
+	size_t size = source->size;
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
+					      size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
 	while (status != Z_STREAM_END) {
 		if (size && !s.avail_in) {
 			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
-			ssize_t read_result = read_in_full(fd, ibuf, rsize);
+			ssize_t read_result;
+
+			read_result = bulk_checkin_source_read(source, ibuf,
+							       rsize);
 			if (read_result < 0)
-				die_errno("failed to read from '%s'", path);
+				die_errno("failed to read from '%s'",
+					  source->path);
 			if (read_result != rsize)
 				die("failed to read %d bytes from '%s'",
-				    (int)rsize, path);
+				    (int)rsize, source->path);
 			offset += rsize;
 			if (*already_hashed_to < offset) {
 				size_t hsize = offset - *already_hashed_to;
@@ -258,6 +297,12 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
+	struct bulk_checkin_source source = {
+		.type = SOURCE_FILE,
+		.fd = fd,
+		.size = size,
+		.path = path,
+	};
 
 	seekback = lseek(fd, 0, SEEK_CUR);
 	if (seekback == (off_t) -1)
@@ -283,7 +328,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			crc32_begin(state->f);
 		}
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 fd, size, path, flags))
+					 &source, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
@@ -295,7 +340,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		hashfile_truncate(state->f, &checkpoint);
 		state->offset = checkpoint.offset;
 		flush_bulk_checkin_packfile(state);
-		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	the_hash_algo->final_oid_fn(result_oid, &ctx);
-- 
2.42.0.405.g86fe3250c2



* [PATCH v4 2/7] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
@ 2023-10-19 17:28 ` Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 3/7] bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source` Taylor Blau
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

The existing `stream_blob_to_pack()` function is named based on the fact
that it knows only how to stream blobs into a bulk-checkin pack.

But there is no longer anything in this function which prevents us from
writing objects of arbitrary types to the bulk-checkin pack. Prepare to
write OBJ_TREEs by removing this assumption, adding an `enum
object_type` parameter to this function's argument list, and renaming it
to `stream_obj_to_pack()` accordingly.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index c05d06e1e1..7e6b52112e 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -188,10 +188,10 @@ static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
  * status before calling us just in case we ask it to call us again
  * with a new pack.
  */
-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
-			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       struct bulk_checkin_source *source,
-			       unsigned flags)
+static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
+			      git_hash_ctx *ctx, off_t *already_hashed_to,
+			      struct bulk_checkin_source *source,
+			      enum object_type type, unsigned flags)
 {
 	git_zstream s;
 	unsigned char ibuf[16384];
@@ -204,8 +204,7 @@ static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
-					      size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
@@ -327,8 +326,8 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			idx->offset = state->offset;
 			crc32_begin(state->f);
 		}
-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 &source, flags))
+		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
+					&source, OBJ_BLOB, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
-- 
2.42.0.405.g86fe3250c2



* [PATCH v4 3/7] bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 2/7] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types Taylor Blau
@ 2023-10-19 17:28 ` Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source` Taylor Blau
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

Prepare for a future change where we will want to use a routine very
similar to the existing `deflate_blob_to_pack()`, but over arbitrary
sources (i.e. either open file descriptors or a location in memory).

Extract a common `deflate_obj_to_pack()` routine that acts on a
bulk_checkin_source instead of an (int, size_t) pair, then rewrite
`deflate_blob_to_pack()` in terms of it.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 52 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 7e6b52112e..28bc8d5ab4 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -285,30 +285,23 @@ static void prepare_to_stream(struct bulk_checkin_packfile *state,
 		die_errno("unable to write pack header");
 }
 
-static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
-				struct object_id *result_oid,
-				int fd, size_t size,
-				const char *path, unsigned flags)
+
+static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
+			       struct object_id *result_oid,
+			       struct bulk_checkin_source *source,
+			       enum object_type type,
+			       off_t seekback,
+			       unsigned flags)
 {
-	off_t seekback, already_hashed_to;
+	off_t already_hashed_to = 0;
 	git_hash_ctx ctx;
 	unsigned char obuf[16384];
 	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
-	struct bulk_checkin_source source = {
-		.type = SOURCE_FILE,
-		.fd = fd,
-		.size = size,
-		.path = path,
-	};
 
-	seekback = lseek(fd, 0, SEEK_CUR);
-	if (seekback == (off_t) -1)
-		return error("cannot find the current offset");
-
-	header_len = format_object_header((char *)obuf, sizeof(obuf),
-					  OBJ_BLOB, size);
+	header_len = format_object_header((char *)obuf, sizeof(obuf), type,
+					  source->size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 	the_hash_algo->init_fn(&checkpoint.ctx);
@@ -317,8 +310,6 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	if ((flags & HASH_WRITE_OBJECT) != 0)
 		CALLOC_ARRAY(idx, 1);
 
-	already_hashed_to = 0;
-
 	while (1) {
 		prepare_to_stream(state, flags);
 		if (idx) {
@@ -327,7 +318,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			crc32_begin(state->f);
 		}
 		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
-					&source, OBJ_BLOB, flags))
+					source, type, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
@@ -339,7 +330,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		hashfile_truncate(state->f, &checkpoint);
 		state->offset = checkpoint.offset;
 		flush_bulk_checkin_packfile(state);
-		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
+		if (bulk_checkin_source_seek_to(source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	the_hash_algo->final_oid_fn(result_oid, &ctx);
@@ -361,6 +352,25 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	return 0;
 }
 
+static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
+				struct object_id *result_oid,
+				int fd, size_t size,
+				const char *path, unsigned flags)
+{
+	struct bulk_checkin_source source = {
+		.type = SOURCE_FILE,
+		.fd = fd,
+		.size = size,
+		.path = path,
+	};
+	off_t seekback = lseek(fd, 0, SEEK_CUR);
+	if (seekback == (off_t) -1)
+		return error("cannot find the current offset");
+
+	return deflate_obj_to_pack(state, result_oid, &source, OBJ_BLOB,
+				   seekback, flags);
+}
+
 void prepare_loose_object_bulk_checkin(void)
 {
 	/*
-- 
2.42.0.405.g86fe3250c2



* [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (2 preceding siblings ...)
  2023-10-19 17:28 ` [PATCH v4 3/7] bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source` Taylor Blau
@ 2023-10-19 17:28 ` Taylor Blau
  2023-10-23  9:19   ` Patrick Steinhardt
  2023-10-19 17:28 ` [PATCH v4 5/7] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

Continue to prepare for streaming an object's contents directly from
memory by teaching `bulk_checkin_source` how to perform reads and seeks
based on an address in memory.

Unlike file descriptors, which manage their own offset internally, we
have to keep track of how many bytes we've read out of the buffer, and
make sure we don't read past the end of the buffer.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 28bc8d5ab4..60361b3e2e 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -141,11 +141,15 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
 }
 
 struct bulk_checkin_source {
-	enum { SOURCE_FILE } type;
+	enum { SOURCE_FILE, SOURCE_INCORE } type;
 
 	/* SOURCE_FILE fields */
 	int fd;
 
+	/* SOURCE_INCORE fields */
+	const void *buf;
+	size_t read;
+
 	/* common fields */
 	size_t size;
 	const char *path;
@@ -157,6 +161,11 @@ static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
 	switch (source->type) {
 	case SOURCE_FILE:
 		return lseek(source->fd, offset, SEEK_SET);
+	case SOURCE_INCORE:
+		if (!(0 <= offset && offset < source->size))
+			return (off_t)-1;
+		source->read = offset;
+		return source->read;
 	default:
 		BUG("unknown bulk-checkin source: %d", source->type);
 	}
@@ -168,6 +177,13 @@ static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
 	switch (source->type) {
 	case SOURCE_FILE:
 		return read_in_full(source->fd, buf, nr);
+	case SOURCE_INCORE:
+		assert(source->read <= source->size);
+		if (nr > source->size - source->read)
+			nr = source->size - source->read;
+		memcpy(buf, (unsigned char *)source->buf + source->read, nr);
+		source->read += nr;
+		return nr;
 	default:
 		BUG("unknown bulk-checkin source: %d", source->type);
 	}
-- 
2.42.0.405.g86fe3250c2



* [PATCH v4 5/7] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (3 preceding siblings ...)
  2023-10-19 17:28 ` [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source` Taylor Blau
@ 2023-10-19 17:28 ` Taylor Blau
  2023-10-19 17:28 ` [PATCH v4 6/7] bulk-checkin: introduce `index_tree_bulk_checkin_incore()` Taylor Blau
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

Now that we have factored out many of the common routines necessary to
index a new object into a pack created by the bulk-checkin machinery, we
can introduce a variant of `index_blob_bulk_checkin()` that acts on
blobs whose contents we can fit in memory.

This will be useful in a couple more commits, in order to provide the
`merge-tree` builtin with a mechanism to create a new pack containing
any objects it created during the merge, instead of storing those
objects individually as loose.

Similar to the existing `index_blob_bulk_checkin()` function, the
entrypoint delegates to `deflate_obj_to_pack_incore()`. That function in
turn delegates to deflate_obj_to_pack(), which is responsible for
formatting the pack header and then deflating the contents into the
pack.

Consistent with the rest of the bulk-checkin mechanism, there are no
direct tests here. In future commits when we expose this new
functionality via the `merge-tree` builtin, we will test it indirectly
there.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 29 +++++++++++++++++++++++++++++
 bulk-checkin.h |  4 ++++
 2 files changed, 33 insertions(+)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 60361b3e2e..655a583b06 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -368,6 +368,23 @@ static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
 	return 0;
 }
 
+static int deflate_obj_to_pack_incore(struct bulk_checkin_packfile *state,
+				       struct object_id *result_oid,
+				       const void *buf, size_t size,
+				       const char *path, enum object_type type,
+				       unsigned flags)
+{
+	struct bulk_checkin_source source = {
+		.type = SOURCE_INCORE,
+		.buf = buf,
+		.size = size,
+		.read = 0,
+		.path = path,
+	};
+
+	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -431,6 +448,18 @@ int index_blob_bulk_checkin(struct object_id *oid,
 	return status;
 }
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
+						buf, size, path, OBJ_BLOB,
+						flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index aa7286a7b3..1b91daeaee 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -13,6 +13,10 @@ int index_blob_bulk_checkin(struct object_id *oid,
 			    int fd, size_t size,
 			    const char *path, unsigned flags);
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called
-- 
2.42.0.405.g86fe3250c2



* [PATCH v4 6/7] bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (4 preceding siblings ...)
  2023-10-19 17:28 ` [PATCH v4 5/7] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
@ 2023-10-19 17:28 ` Taylor Blau
  2023-10-19 17:29 ` [PATCH v4 7/7] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:28 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

The remaining piece needed to teach the `merge-tree` builtin how to
write the contents of a merge into a pack is a function to index tree
objects into a bulk-checkin pack.

This patch implements that missing piece, which is a thin wrapper around
all of the functionality introduced in previous commits.

If and when Git gains support for a "compatibility" hash algorithm, the
changes to support that here will be minimal. The bulk-checkin machinery
will need to convert the incoming tree to compute its length under the
compatibility hash, necessary to reconstruct its header. With that
information (and the converted contents of the tree), the bulk-checkin
machinery will have enough to keep track of the converted object's hash
in order to update the compatibility mapping.

Within some thin wrapper around `deflate_obj_to_pack_incore()` (perhaps
`deflate_tree_to_pack_incore()`), the changes should be limited to
something like:

    struct strbuf converted = STRBUF_INIT;
    if (the_repository->compat_hash_algo) {
      if (convert_object_file(&compat_obj,
                              the_repository->hash_algo,
                              the_repository->compat_hash_algo, ...) < 0)
        die(...);

      format_object_header_hash(the_repository->compat_hash_algo,
                                OBJ_TREE, size);
    }
    /* compute the converted tree's hash using the compat algorithm */
    strbuf_release(&converted);

, assuming related changes throughout the rest of the bulk-checkin
machinery necessary to update the hash of the converted object, which
are likewise minimal in size.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 12 ++++++++++++
 bulk-checkin.h |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 655a583b06..c1faf75f5f 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -460,6 +460,18 @@ int index_blob_bulk_checkin_incore(struct object_id *oid,
 	return status;
 }
 
+int index_tree_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
+						buf, size, path, OBJ_TREE,
+						flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index 1b91daeaee..89786b3954 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -17,6 +17,10 @@ int index_blob_bulk_checkin_incore(struct object_id *oid,
 				   const void *buf, size_t size,
 				   const char *path, unsigned flags);
 
+int index_tree_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called
-- 
2.42.0.405.g86fe3250c2



* [PATCH v4 7/7] builtin/merge-tree.c: implement support for `--write-pack`
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (5 preceding siblings ...)
  2023-10-19 17:28 ` [PATCH v4 6/7] bulk-checkin: introduce `index_tree_bulk_checkin_incore()` Taylor Blau
@ 2023-10-19 17:29 ` Taylor Blau
  2023-10-19 21:47 ` [PATCH v4 0/7] merge-ort: implement support for packing objects together Junio C Hamano
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-19 17:29 UTC
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

When using merge-tree often within a repository[^1], it is possible to
generate a relatively large number of loose objects, which can result
in degraded performance and, in extreme cases, inode exhaustion.

Building on the functionality introduced in previous commits, the
bulk-checkin machinery now has support to write arbitrary blob and tree
objects which are small enough to be held in-core. We can use this to
write any blob/tree objects generated by ORT into a separate pack
instead of writing them out individually as loose.

This functionality is gated behind a new `--write-pack` option to
`merge-tree` that works with the (non-deprecated) `--write-tree` mode.

The implementation is relatively straightforward. There are two spots
within the ORT mechanism where we call `write_object_file()`, one for
content differences within blobs, and another to assemble any new trees
necessary to construct the merge. In each of those locations,
conditionally replace calls to `write_object_file()` with
`index_blob_bulk_checkin_incore()` or `index_tree_bulk_checkin_incore()`
depending on which kind of object we are writing.

The only remaining task is to begin and end the transaction necessary to
initialize the bulk-checkin machinery, and move any new pack(s) it
created into the main object store.

[^1]: Such is the case at GitHub, where we run presumptive "test merges"
  on open pull requests to see whether we can light up the merge button
  green, depending on whether the presumptive merge was conflicted.

  This is done in response to a number of user-initiated events,
  including viewing an open pull request whose last test merge is stale
  with respect to the current base and tip of the pull request. As a
  result, merge-tree can be run very frequently on large, active
  repositories.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-merge-tree.txt |  4 ++
 builtin/merge-tree.c             |  5 ++
 merge-ort.c                      | 42 +++++++++++----
 merge-recursive.h                |  1 +
 t/t4301-merge-tree-write-tree.sh | 93 ++++++++++++++++++++++++++++++++
 5 files changed, 136 insertions(+), 9 deletions(-)

diff --git a/Documentation/git-merge-tree.txt b/Documentation/git-merge-tree.txt
index ffc4fbf7e8..9d37609ef1 100644
--- a/Documentation/git-merge-tree.txt
+++ b/Documentation/git-merge-tree.txt
@@ -69,6 +69,10 @@ OPTIONS
 	specify a merge-base for the merge, and specifying multiple bases is
 	currently not supported. This option is incompatible with `--stdin`.
 
+--write-pack::
+	Write any new objects into a separate packfile instead of as
+	individual loose objects.
+
 [[OUTPUT]]
 OUTPUT
 ------
diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c
index 0de42aecf4..672ebd4c54 100644
--- a/builtin/merge-tree.c
+++ b/builtin/merge-tree.c
@@ -18,6 +18,7 @@
 #include "quote.h"
 #include "tree.h"
 #include "config.h"
+#include "bulk-checkin.h"
 
 static int line_termination = '\n';
 
@@ -414,6 +415,7 @@ struct merge_tree_options {
 	int show_messages;
 	int name_only;
 	int use_stdin;
+	int write_pack;
 };
 
 static int real_merge(struct merge_tree_options *o,
@@ -440,6 +442,7 @@ static int real_merge(struct merge_tree_options *o,
 	init_merge_options(&opt, the_repository);
 
 	opt.show_rename_progress = 0;
+	opt.write_pack = o->write_pack;
 
 	opt.branch1 = branch1;
 	opt.branch2 = branch2;
@@ -548,6 +551,8 @@ int cmd_merge_tree(int argc, const char **argv, const char *prefix)
 			   &merge_base,
 			   N_("commit"),
 			   N_("specify a merge-base for the merge")),
+		OPT_BOOL(0, "write-pack", &o.write_pack,
+			 N_("write new objects to a pack instead of as loose")),
 		OPT_END()
 	};
 
diff --git a/merge-ort.c b/merge-ort.c
index 3653725661..523577d71e 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -48,6 +48,7 @@
 #include "tree.h"
 #include "unpack-trees.h"
 #include "xdiff-interface.h"
+#include "bulk-checkin.h"
 
 /*
  * We have many arrays of size 3.  Whenever we have such an array, the
@@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt,
 		if ((merge_status < 0) || !result_buf.ptr)
 			ret = error(_("failed to execute internal merge"));
 
-		if (!ret &&
-		    write_object_file(result_buf.ptr, result_buf.size,
-				      OBJ_BLOB, &result->oid))
-			ret = error(_("unable to add %s to database"), path);
+		if (!ret) {
+			ret = opt->write_pack
+				? index_blob_bulk_checkin_incore(&result->oid,
+								 result_buf.ptr,
+								 result_buf.size,
+								 path, 1)
+				: write_object_file(result_buf.ptr,
+						    result_buf.size,
+						    OBJ_BLOB, &result->oid);
+			if (ret)
+				ret = error(_("unable to add %s to database"),
+					    path);
+		}
 
 		free(result_buf.ptr);
 		if (ret)
@@ -3597,7 +3607,8 @@ static int tree_entry_order(const void *a_, const void *b_)
 				 b->string, strlen(b->string), bmi->result.mode);
 }
 
-static int write_tree(struct object_id *result_oid,
+static int write_tree(struct merge_options *opt,
+		      struct object_id *result_oid,
 		      struct string_list *versions,
 		      unsigned int offset,
 		      size_t hash_size)
@@ -3631,8 +3642,14 @@ static int write_tree(struct object_id *result_oid,
 	}
 
 	/* Write this object file out, and record in result_oid */
-	if (write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid))
+	ret = opt->write_pack
+		? index_tree_bulk_checkin_incore(result_oid,
+						 buf.buf, buf.len, "", 1)
+		: write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid);
+
+	if (ret)
 		ret = -1;
+
 	strbuf_release(&buf);
 	return ret;
 }
@@ -3797,8 +3814,8 @@ static int write_completed_directory(struct merge_options *opt,
 		 */
 		dir_info->is_null = 0;
 		dir_info->result.mode = S_IFDIR;
-		if (write_tree(&dir_info->result.oid, &info->versions, offset,
-			       opt->repo->hash_algo->rawsz) < 0)
+		if (write_tree(opt, &dir_info->result.oid, &info->versions,
+			       offset, opt->repo->hash_algo->rawsz) < 0)
 			ret = -1;
 	}
 
@@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt,
 		fflush(stdout);
 		BUG("dir_metadata accounting completely off; shouldn't happen");
 	}
-	if (write_tree(result_oid, &dir_metadata.versions, 0,
+	if (write_tree(opt, result_oid, &dir_metadata.versions, 0,
 		       opt->repo->hash_algo->rawsz) < 0)
 		ret = -1;
+
+	if (opt->write_pack)
+		end_odb_transaction();
+
 cleanup:
 	string_list_clear(&plist, 0);
 	string_list_clear(&dir_metadata.versions, 0);
@@ -4878,6 +4899,9 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
 	 */
 	strmap_init(&opt->priv->conflicts);
 
+	if (opt->write_pack)
+		begin_odb_transaction();
+
 	trace2_region_leave("merge", "allocate/init", opt->repo);
 }
 
diff --git a/merge-recursive.h b/merge-recursive.h
index b88000e3c2..156e160876 100644
--- a/merge-recursive.h
+++ b/merge-recursive.h
@@ -48,6 +48,7 @@ struct merge_options {
 	unsigned renormalize : 1;
 	unsigned record_conflict_msgs_as_headers : 1;
 	const char *msg_header_prefix;
+	unsigned write_pack : 1;
 
 	/* internal fields used by the implementation */
 	struct merge_options_internal *priv;
diff --git a/t/t4301-merge-tree-write-tree.sh b/t/t4301-merge-tree-write-tree.sh
index 250f721795..2d81ff4de5 100755
--- a/t/t4301-merge-tree-write-tree.sh
+++ b/t/t4301-merge-tree-write-tree.sh
@@ -922,4 +922,97 @@ test_expect_success 'check the input format when --stdin is passed' '
 	test_cmp expect actual
 '
 
+packdir=".git/objects/pack"
+
+test_expect_success 'merge-tree can pack its result with --write-pack' '
+	test_when_finished "rm -rf repo" &&
+	git init repo &&
+
+	# base has lines [3, 4, 5]
+	#   - side adds to the beginning, resulting in [1, 2, 3, 4, 5]
+	#   - other adds to the end, resulting in [3, 4, 5, 6, 7]
+	#
+	# merging the two should result in a new blob object containing
+	# [1, 2, 3, 4, 5, 6, 7], along with a new tree.
+	test_commit -C repo base file "$(test_seq 3 5)" &&
+	git -C repo branch -M main &&
+	git -C repo checkout -b side main &&
+	test_commit -C repo side file "$(test_seq 1 5)" &&
+	git -C repo checkout -b other main &&
+	test_commit -C repo other file "$(test_seq 3 7)" &&
+
+	find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
+	tree="$(git -C repo merge-tree --write-pack \
+		refs/tags/side refs/tags/other)" &&
+	blob="$(git -C repo rev-parse $tree:file)" &&
+	find repo/$packdir -type f -name "pack-*.idx" >packs.after &&
+
+	test_must_be_empty packs.before &&
+	test_line_count = 1 packs.after &&
+
+	git show-index <$(cat packs.after) >objects &&
+	test_line_count = 2 objects &&
+	grep "^[1-9][0-9]* $tree" objects &&
+	grep "^[1-9][0-9]* $blob" objects
+'
+
+test_expect_success 'merge-tree can write multiple packs with --write-pack' '
+	test_when_finished "rm -rf repo" &&
+	git init repo &&
+	(
+		cd repo &&
+
+		git config pack.packSizeLimit 512 &&
+
+		test_seq 512 >f &&
+
+		# "f" contains roughly 2,000 bytes.
+		#
+		# Each side ("foo" and "bar") adds a small amount of data at the
+		# beginning and end of "base", respectively.
+		git add f &&
+		test_tick &&
+		git commit -m base &&
+		git branch -M main &&
+
+		git checkout -b foo main &&
+		{
+			echo foo && cat f
+		} >f.tmp &&
+		mv f.tmp f &&
+		git add f &&
+		test_tick &&
+		git commit -m foo &&
+
+		git checkout -b bar main &&
+		echo bar >>f &&
+		git add f &&
+		test_tick &&
+		git commit -m bar &&
+
+		find $packdir -type f -name "pack-*.idx" >packs.before &&
+		# Merging the two sides should result in a new object which is
+		# larger than the 512-byte pack.packSizeLimit, thus the result
+		# should be split into two separate packs.
+		tree="$(git merge-tree --write-pack \
+			refs/heads/foo refs/heads/bar)" &&
+		blob="$(git rev-parse $tree:f)" &&
+		find $packdir -type f -name "pack-*.idx" >packs.after &&
+
+		test_must_be_empty packs.before &&
+		test_line_count = 2 packs.after &&
+		for idx in $(cat packs.after)
+		do
+			git show-index <$idx || return 1
+		done >objects &&
+
+		# The resulting set of packs should contain one copy of both
+		# objects, each in a separate pack.
+		test_line_count = 2 objects &&
+		grep "^[1-9][0-9]* $tree" objects &&
+		grep "^[1-9][0-9]* $blob" objects
+
+	)
+'
+
 test_done
-- 
2.42.0.405.g86fe3250c2


* Re: [PATCH v4 0/7] merge-ort: implement support for packing objects together
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (6 preceding siblings ...)
  2023-10-19 17:29 ` [PATCH v4 7/7] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
@ 2023-10-19 21:47 ` Junio C Hamano
  2023-10-20  7:29 ` Jeff King
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-10-19 21:47 UTC
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Patrick Steinhardt

Taylor Blau <me@ttaylorr.com> writes:

> By factoring out a common
> "deflate object to pack" routine that takes an abstract
> bulk_checkin_source as a parameter, all of the earlier refactorings
> can be dropped, since we retain only a single caller instead of
> multiple.

Ah, that is how we managed to lose the preparatory split of new
helper functions.  The names given to these functions that represent
steps of checkpointing were well thought out, and they served their
documentation purposes very well, but if they all have a single
caller, then that is fine ;-).

I _think_ I carefully followed the code/data flow of the new code to
see that we do identical things when streaming from a file to a pack
(in other words, this does not introduce a regression), and nothing
objectionable stood out.  The order of presentation was a good way to
let readers do so, and the actual "now we have everything working as
we want, let's teach the machinery to also read from an in-core
buffer" step being very small gives us extra confidence in the result.
Very nicely done.


* Re: [PATCH v4 0/7] merge-ort: implement support for packing objects together
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (7 preceding siblings ...)
  2023-10-19 21:47 ` [PATCH v4 0/7] merge-ort: implement support for packing objects together Junio C Hamano
@ 2023-10-20  7:29 ` Jeff King
  2023-10-20 16:53   ` Junio C Hamano
  2023-10-23  9:19 ` Patrick Steinhardt
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
  10 siblings, 1 reply; 63+ messages in thread
From: Jeff King @ 2023-10-20  7:29 UTC
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Thu, Oct 19, 2023 at 01:28:38PM -0400, Taylor Blau wrote:

> This is a new round that is significantly simplified thanks to
> another very helpful suggestion[1] from Junio. By factoring out a
> common "deflate object to pack" routine that takes an abstract
> bulk_checkin_source as a parameter, all of the earlier refactorings
> can be dropped, since we retain only a single caller instead of
> multiple.

FWIW, I gave this a read-over and did not see anything wrong (on the
contrary, I think the new abstractions make it quite easy to follow).

Two thoughts that occurred to me while reading it (but neither
actionable for your series):

  - Having not worked with the bulk-checkin code much before, I thought
    it only put in one blob per pack, and thus you'd have to teach it to
    handle multiple objects, too. But fortunately I was wrong, and it
    handles this case already. (I mention this mainly because we had
    talked about it off-list a few weeks ago, and some of my confusion
    may have seeped into my comments then).

  - I did briefly wonder if we need this SOURCE abstraction at all. The
    file source assumes we can seek() and read(), so it must be a
    regular file. We could probably just mmap() it and treat it as
    in-core, too (this is what index_fd() and index_path() do under the
    hood!). But it would probably be a disservice to any platforms that
    do not support mmap, as the fallback is to read into a full buffer
    (and the whole point of the bulk checkin code is to avoid that).
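
    A minimal sketch of that idea (hypothetical, not part of this
    series; it reuses the SOURCE_INCORE fields from these patches and
    Git's xmmap() wrapper, applied to the fd/size/path arguments of
    deflate_blob_to_pack()) might be:

        /* map the whole file and reuse the in-core code path */
        void *map = xmmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
        struct bulk_checkin_source source = {
                .type = SOURCE_INCORE,
                .buf = map,
                .size = size,
                .path = path,
        };
        /* ... deflate from 'source' as if it were in-core ... */
        munmap(map, size);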

-Peff


* Re: [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-19 17:28 ` [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
@ 2023-10-20  7:35   ` Jeff King
  2023-10-20 16:55     ` Junio C Hamano
  0 siblings, 1 reply; 63+ messages in thread
From: Jeff King @ 2023-10-20  7:35 UTC
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Thu, Oct 19, 2023 at 01:28:42PM -0400, Taylor Blau wrote:

> +struct bulk_checkin_source {
> +	enum { SOURCE_FILE } type;
> +
> +	/* SOURCE_FILE fields */
> +	int fd;
> +
> +	/* common fields */
> +	size_t size;
> +	const char *path;
> +};

This is a pretty minor nit, but we may find that "SOURCE_FILE" is not
sufficiently name-spaced. Enum constants are in the global namespace,
so the compiler will barf if there are any conflicts.

It might be OK here, since this is local to a single C file (so we at
least are not hurting other code), but we may be in trouble if code in a
header file is less careful. There is already a near-miss here with
GREP_SOURCE_FILE, but fortunately grep.h is indeed careful. :)

(I notice that ref-filter.c similarly uses SOURCE_* for an internal
enum).

-Peff


* Re: [PATCH v4 0/7] merge-ort: implement support for packing objects together
  2023-10-20  7:29 ` Jeff King
@ 2023-10-20 16:53   ` Junio C Hamano
  0 siblings, 0 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-10-20 16:53 UTC
  To: Jeff King
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Patrick Steinhardt

Jeff King <peff@peff.net> writes:

>   - Having not worked with the bulk-checkin code much before, I thought
>     it only put in one blob per pack, and thus you'd have to teach it to
>     handle multiple objects, too. But fortunately I was wrong, and it
>     handles this case already. (I mention this mainly because we had
>     talked about it off-list a few weeks ago, and some of my confusion
>     may have seeped into my comments then).

You probably had it mixed up with a real limitation, i.e., "unlike
fast-import, which considers that the immediately preceding object
could be a good delta base, this code does not even attempt any
deltification" ;-)

>   - I did briefly wonder if we need this SOURCE abstraction at all. The
>     file source assumes we can seek() and read(), so it must be a
>     regular file. We could probably just mmap() it and treat it as
>     in-core, too (this is what index_fd() and index_path() do under the
>     hood!).

Ahhhh, I've never thought of that one.  I actually was hoping that
we could add an abstraction layer that can be reused elsewhere and
everywhere (not limited to the bulk-checkin codepath for its source
material), which might help folks who want to refactor Git enough to
allow them to plug in $CORP-specific goodies like a virtual filesystem
layer ;-).

>     But it would probably be a disservice to any platforms that
>     do not support mmap, as the fallback is to read into a full buffer
>     (and the whole point of the bulk checkin code is to avoid that).

Yes.  The result was a pleasant read.

Thanks, both.


* Re: [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-20  7:35   ` Jeff King
@ 2023-10-20 16:55     ` Junio C Hamano
  0 siblings, 0 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-10-20 16:55 UTC
  To: Jeff King
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Patrick Steinhardt

Jeff King <peff@peff.net> writes:

> On Thu, Oct 19, 2023 at 01:28:42PM -0400, Taylor Blau wrote:
>
>> +struct bulk_checkin_source {
>> +	enum { SOURCE_FILE } type;
>> +
>> +	/* SOURCE_FILE fields */
>> +	int fd;
>> +
>> +	/* common fields */
>> +	size_t size;
>> +	const char *path;
>> +};
>
> This is a pretty minor nit, but we may find that "SOURCE_FILE" is not
> sufficiently name-spaced. Enum tags are in the global namespace, so
> the compiler will barf if there are any conflicts.

A very good point.  This was one of the reasons why I suggested
avoiding the switch() based on a new enum altogether and using a
vtable-based approach instead.  But it may do for now, while this one
is private to the file and never exposed to other subsystems.
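
For illustration, that vtable shape might look something like this (a
sketch with made-up member names, not code from this series):

        struct bulk_checkin_source {
                ssize_t (*read)(struct bulk_checkin_source *, void *, size_t);
                off_t (*seek)(struct bulk_checkin_source *, off_t);
                void *data;     /* backend-specific state */
                size_t size;
                const char *path;
        };

so that adding a new kind of source means supplying two callbacks
rather than extending a switch() in every operation.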

> It might be OK here, since this is local to a single C file (so we at
> least are not hurting other code), but we may be in trouble if code in a
> header file is less careful. There is already a near-miss here with
> GREP_SOURCE_FILE, but fortunately grep.h is indeed careful. :)
>
> (I notice that ref-filter.c similarly uses SOURCE_* for an internal
> enum).

Thanks.


* Re: [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  2023-10-19 17:28 ` [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source` Taylor Blau
@ 2023-10-23  9:19   ` Patrick Steinhardt
  2023-10-23 18:58     ` Jeff King
  0 siblings, 1 reply; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-23  9:19 UTC
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano


On Thu, Oct 19, 2023 at 01:28:51PM -0400, Taylor Blau wrote:
> Continue to prepare for streaming an object's contents directly from
> memory by teaching `bulk_checkin_source` how to perform reads and seeks
> based on an address in memory.
> 
> Unlike file descriptors, which manage their own offset internally, we
> have to keep track of how many bytes we've read out of the buffer, and
> make sure we don't read past the end of the buffer.
> 
> Suggested-by: Junio C Hamano <gitster@pobox.com>
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  bulk-checkin.c | 18 +++++++++++++++++-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index 28bc8d5ab4..60361b3e2e 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -141,11 +141,15 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
>  }
>  
>  struct bulk_checkin_source {
> -	enum { SOURCE_FILE } type;
> +	enum { SOURCE_FILE, SOURCE_INCORE } type;
>  
>  	/* SOURCE_FILE fields */
>  	int fd;
>  
> +	/* SOURCE_INCORE fields */
> +	const void *buf;
> +	size_t read;
> +
>  	/* common fields */
>  	size_t size;
>  	const char *path;
> @@ -157,6 +161,11 @@ static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
>  	switch (source->type) {
>  	case SOURCE_FILE:
>  		return lseek(source->fd, offset, SEEK_SET);
> +	case SOURCE_INCORE:
> +		if (!(0 <= offset && offset < source->size))
> +			return (off_t)-1;
> +		source->read = offset;
> +		return source->read;
>  	default:
>  		BUG("unknown bulk-checkin source: %d", source->type);
>  	}
> @@ -168,6 +177,13 @@ static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
>  	switch (source->type) {
>  	case SOURCE_FILE:
>  		return read_in_full(source->fd, buf, nr);
> +	case SOURCE_INCORE:
> +		assert(source->read <= source->size);

Is there any guideline around when to use `assert()` vs `BUG()`? I think
that this assertion here is quite critical, because when it does not
hold we can end up performing out-of-bounds reads and writes. But as
asserts are typically missing in non-debug builds, this safeguard would
not do anything for our end users, right?

Patrick

> +		if (nr > source->size - source->read)
> +			nr = source->size - source->read;
> +		memcpy(buf, (unsigned char *)source->buf + source->read, nr);
> +		source->read += nr;
> +		return nr;
>  	default:
>  		BUG("unknown bulk-checkin source: %d", source->type);
>  	}
> -- 
> 2.42.0.405.g86fe3250c2
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4 0/7] merge-ort: implement support for packing objects together
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (8 preceding siblings ...)
  2023-10-20  7:29 ` Jeff King
@ 2023-10-23  9:19 ` Patrick Steinhardt
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
  10 siblings, 0 replies; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-23  9:19 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano

On Thu, Oct 19, 2023 at 01:28:38PM -0400, Taylor Blau wrote:
> (Rebased onto the current tip of 'master', which is 813d9a9188 (The
> nineteenth batch, 2023-10-18) at the time of writing).
> 
> This series implements support for a new merge-tree option,
> `--write-pack`, which causes any newly-written objects to be packed
> together instead of being stored individually as loose.
> 
> I intentionally broke this off from the existing thread, since I
> accidentally rerolled mine and Jonathan Tan's Bloom v2 series into it,
> causing some confusion.
> 
> This is a new round that is significantly simplified thanks to
> another very helpful suggestion[1] from Junio. By factoring out a common
> "deflate object to pack" that takes an abstract bulk_checkin_source as a
> parameter, all of the earlier refactorings can be dropped since we
> retain only a single caller instead of multiple.
> 
> This resulted in a rather satisfying range-diff (included below, as
> usual), and a similarly satisfying inter-diff:
> 
>     $ git diff --stat tb/ort-bulk-checkin.v3..
>      bulk-checkin.c | 203 ++++++++++++++++---------------------------------
>      1 file changed, 64 insertions(+), 139 deletions(-)
> 
> Beyond that, the changes since last time can be viewed in the range-diff
> below. Thanks in advance for any review!
> 
> [1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/

A single question regarding an `assert()` from my side. Other than that
the series looks good to me, thanks.

Patrick

> Taylor Blau (7):
>   bulk-checkin: extract abstract `bulk_checkin_source`
>   bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
>   bulk-checkin: refactor deflate routine to accept a
>     `bulk_checkin_source`
>   bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
>   bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>   bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
>   builtin/merge-tree.c: implement support for `--write-pack`
> 
>  Documentation/git-merge-tree.txt |   4 +
>  builtin/merge-tree.c             |   5 +
>  bulk-checkin.c                   | 161 ++++++++++++++++++++++++++-----
>  bulk-checkin.h                   |   8 ++
>  merge-ort.c                      |  42 ++++++--
>  merge-recursive.h                |   1 +
>  t/t4301-merge-tree-write-tree.sh |  93 ++++++++++++++++++
>  7 files changed, 280 insertions(+), 34 deletions(-)
> 
> Range-diff against v3:
>  1:  2dffa45183 <  -:  ---------- bulk-checkin: factor out `format_object_header_hash()`
>  2:  7a10dc794a <  -:  ---------- bulk-checkin: factor out `prepare_checkpoint()`
>  3:  20c32d2178 <  -:  ---------- bulk-checkin: factor out `truncate_checkpoint()`
>  4:  893051d0b7 <  -:  ---------- bulk-checkin: factor out `finalize_checkpoint()`
>  5:  da52ec8380 !  1:  97bb6e9f59 bulk-checkin: extract abstract `bulk_checkin_source`
>     @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
>       			if (*already_hashed_to < offset) {
>       				size_t hsize = offset - *already_hashed_to;
>      @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     - 	git_hash_ctx ctx;
>     + 	unsigned header_len;
>       	struct hashfile_checkpoint checkpoint = {0};
>       	struct pack_idx_entry *idx = NULL;
>      +	struct bulk_checkin_source source = {
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       	seekback = lseek(fd, 0, SEEK_CUR);
>       	if (seekback == (off_t) -1)
>      @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     - 	while (1) {
>     - 		prepare_checkpoint(state, &checkpoint, idx, flags);
>     + 			crc32_begin(state->f);
>     + 		}
>       		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
>      -					 fd, size, path, flags))
>      +					 &source, flags))
>       			break;
>     - 		truncate_checkpoint(state, &checkpoint, idx);
>     + 		/*
>     + 		 * Writing this object to the current pack will make
>     +@@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     + 		hashfile_truncate(state->f, &checkpoint);
>     + 		state->offset = checkpoint.offset;
>     + 		flush_bulk_checkin_packfile(state);
>      -		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
>      +		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
>       			return error("cannot seek back");
>       	}
>     - 	finalize_checkpoint(state, &ctx, &checkpoint, idx, result_oid);
>     + 	the_hash_algo->final_oid_fn(result_oid, &ctx);
>  7:  04ec74e357 !  2:  9d633df339 bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
>     @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
>       	s.avail_out = sizeof(obuf) - hdrlen;
>       
>      @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     - 
>     - 	while (1) {
>     - 		prepare_checkpoint(state, &checkpoint, idx, flags);
>     + 			idx->offset = state->offset;
>     + 			crc32_begin(state->f);
>     + 		}
>      -		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
>      -					 &source, flags))
>      +		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
>      +					&source, OBJ_BLOB, flags))
>       			break;
>     - 		truncate_checkpoint(state, &checkpoint, idx);
>     - 		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
>     + 		/*
>     + 		 * Writing this object to the current pack will make
>  -:  ---------- >  3:  d5bbd7810e bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
>  6:  4e9bac5bc1 =  4:  e427fe6ad3 bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
>  8:  8667b76365 !  5:  48095afe80 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>     @@ Commit message
>          objects individually as loose.
>      
>          Similar to the existing `index_blob_bulk_checkin()` function, the
>     -    entrypoint delegates to `deflate_blob_to_pack_incore()`, which is
>     -    responsible for formatting the pack header and then deflating the
>     -    contents into the pack. The latter is accomplished by calling
>     -    deflate_obj_contents_to_pack_incore(), which takes advantage of the
>     -    earlier refactorings and is responsible for writing the object to the
>     -    pack and handling any overage from pack.packSizeLimit.
>     -
>     -    The bulk of the new functionality is implemented in the function
>     -    `stream_obj_to_pack()`, which can handle streaming objects from memory
>     -    to the bulk-checkin pack as a result of the earlier refactoring.
>     +    entrypoint delegates to `deflate_obj_to_pack_incore()`. That function in
>     +    turn delegates to deflate_obj_to_pack(), which is responsible for
>     +    formatting the pack header and then deflating the contents into the
>     +    pack.
>      
>          Consistent with the rest of the bulk-checkin mechanism, there are no
>          direct tests here. In future commits when we expose this new
>     @@ Commit message
>          Signed-off-by: Taylor Blau <me@ttaylorr.com>
>      
>       ## bulk-checkin.c ##
>     -@@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *state,
>     - 	}
>     +@@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
>     + 	return 0;
>       }
>       
>     -+static int deflate_obj_contents_to_pack_incore(struct bulk_checkin_packfile *state,
>     -+					       git_hash_ctx *ctx,
>     -+					       struct hashfile_checkpoint *checkpoint,
>     -+					       struct object_id *result_oid,
>     -+					       const void *buf, size_t size,
>     -+					       enum object_type type,
>     -+					       const char *path, unsigned flags)
>     ++static int deflate_obj_to_pack_incore(struct bulk_checkin_packfile *state,
>     ++				       struct object_id *result_oid,
>     ++				       const void *buf, size_t size,
>     ++				       const char *path, enum object_type type,
>     ++				       unsigned flags)
>      +{
>     -+	struct pack_idx_entry *idx = NULL;
>     -+	off_t already_hashed_to = 0;
>      +	struct bulk_checkin_source source = {
>      +		.type = SOURCE_INCORE,
>      +		.buf = buf,
>     @@ bulk-checkin.c: static void finalize_checkpoint(struct bulk_checkin_packfile *st
>      +		.path = path,
>      +	};
>      +
>     -+	/* Note: idx is non-NULL when we are writing */
>     -+	if (flags & HASH_WRITE_OBJECT)
>     -+		CALLOC_ARRAY(idx, 1);
>     -+
>     -+	while (1) {
>     -+		prepare_checkpoint(state, checkpoint, idx, flags);
>     -+
>     -+		if (!stream_obj_to_pack(state, ctx, &already_hashed_to, &source,
>     -+					type, flags))
>     -+			break;
>     -+		truncate_checkpoint(state, checkpoint, idx);
>     -+		bulk_checkin_source_seek_to(&source, 0);
>     -+	}
>     -+
>     -+	finalize_checkpoint(state, ctx, checkpoint, idx, result_oid);
>     -+
>     -+	return 0;
>     -+}
>     -+
>     -+static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
>     -+				       struct object_id *result_oid,
>     -+				       const void *buf, size_t size,
>     -+				       const char *path, unsigned flags)
>     -+{
>     -+	git_hash_ctx ctx;
>     -+	struct hashfile_checkpoint checkpoint = {0};
>     -+
>     -+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_BLOB,
>     -+				  size);
>     -+
>     -+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
>     -+						   result_oid, buf, size,
>     -+						   OBJ_BLOB, path, flags);
>     ++	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
>      +}
>      +
>       static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     @@ bulk-checkin.c: int index_blob_bulk_checkin(struct object_id *oid,
>      +				   const void *buf, size_t size,
>      +				   const char *path, unsigned flags)
>      +{
>     -+	int status = deflate_blob_to_pack_incore(&bulk_checkin_packfile, oid,
>     -+						 buf, size, path, flags);
>     ++	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
>     ++						buf, size, path, OBJ_BLOB,
>     ++						flags);
>      +	if (!odb_transaction_nesting)
>      +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
>      +	return status;
>  9:  cba043ef14 !  6:  60568f9281 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
>     @@ Commit message
>          machinery will have enough to keep track of the converted object's hash
>          in order to update the compatibility mapping.
>      
>     -    Within `deflate_tree_to_pack_incore()`, the changes should be limited
>     -    to something like:
>     +    Within some thin wrapper around `deflate_obj_to_pack_incore()` (perhaps
>     +    `deflate_tree_to_pack_incore()`), the changes should be limited to
>     +    something like:
>      
>              struct strbuf converted = STRBUF_INIT;
>              if (the_repository->compat_hash_algo) {
>     @@ Commit message
>          Signed-off-by: Taylor Blau <me@ttaylorr.com>
>      
>       ## bulk-checkin.c ##
>     -@@ bulk-checkin.c: static int deflate_blob_to_pack_incore(struct bulk_checkin_packfile *state,
>     - 						   OBJ_BLOB, path, flags);
>     - }
>     - 
>     -+static int deflate_tree_to_pack_incore(struct bulk_checkin_packfile *state,
>     -+				       struct object_id *result_oid,
>     -+				       const void *buf, size_t size,
>     -+				       const char *path, unsigned flags)
>     -+{
>     -+	git_hash_ctx ctx;
>     -+	struct hashfile_checkpoint checkpoint = {0};
>     -+
>     -+	format_object_header_hash(the_hash_algo, &ctx, &checkpoint, OBJ_TREE,
>     -+				  size);
>     -+
>     -+	return deflate_obj_contents_to_pack_incore(state, &ctx, &checkpoint,
>     -+						   result_oid, buf, size,
>     -+						   OBJ_TREE, path, flags);
>     -+}
>     -+
>     - static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     - 				struct object_id *result_oid,
>     - 				int fd, size_t size,
>      @@ bulk-checkin.c: int index_blob_bulk_checkin_incore(struct object_id *oid,
>       	return status;
>       }
>     @@ bulk-checkin.c: int index_blob_bulk_checkin_incore(struct object_id *oid,
>      +				   const void *buf, size_t size,
>      +				   const char *path, unsigned flags)
>      +{
>     -+	int status = deflate_tree_to_pack_incore(&bulk_checkin_packfile, oid,
>     -+						 buf, size, path, flags);
>     ++	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
>     ++						buf, size, path, OBJ_TREE,
>     ++						flags);
>      +	if (!odb_transaction_nesting)
>      +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
>      +	return status;
> 10:  ae70508037 =  7:  b9be9df122 builtin/merge-tree.c: implement support for `--write-pack`
> -- 
> 2.42.0.405.g86fe3250c2


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  2023-10-23  9:19   ` Patrick Steinhardt
@ 2023-10-23 18:58     ` Jeff King
  2023-10-24  6:34       ` Patrick Steinhardt
  0 siblings, 1 reply; 63+ messages in thread
From: Jeff King @ 2023-10-23 18:58 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Junio C Hamano

On Mon, Oct 23, 2023 at 11:19:13AM +0200, Patrick Steinhardt wrote:

> > +	case SOURCE_INCORE:
> > +		assert(source->read <= source->size);
> 
> Is there any guideline around when to use `assert()` vs `BUG()`? I think
> that this assertion here is quite critical, because when it does not
> hold we can end up performing out-of-bounds reads and writes. But as
> asserts are typically missing in non-debug builds, this safeguard would
> not do anything for our end users, right?

I don't think we have a written guideline. My philosophy is: always use
BUG(), because you will never be surprised that the assertion was not
compiled in (and I think compiling without assertions is almost
certainly premature optimization, especially given the way we tend to
use them).
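
Applied to the hunk quoted above, that philosophy would turn the
assert() into an unconditional check, which is in fact what the v5
reroll later in this thread does:

    -		assert(source->read <= source->size);
    +		if (source->read > source->size)
    +			BUG("read beyond bulk-checkin source buffer end "
    +			    "(%"PRIuMAX" > %"PRIuMAX")",
    +			    (uintmax_t)source->read,
    +			    (uintmax_t)source->size);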

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
                   ` (9 preceding siblings ...)
  2023-10-23  9:19 ` Patrick Steinhardt
@ 2023-10-23 22:44 ` Taylor Blau
  2023-10-23 22:44   ` [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
                     ` (6 more replies)
  10 siblings, 7 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-23 22:44 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

(Rebased onto the current tip of 'master', which is ceadf0f3cf (The
twentieth batch, 2023-10-20)).

This series implements support for a new merge-tree option,
`--write-pack`, which causes any newly-written objects to be packed
together instead of being stored individually as loose.

This is a minor follow-up that could be taken instead of v4 (though the
changes between these two most recent rounds are stylistic and a matter
of subjective opinion).

This moves the bulk_checkin_source structure introduced in response
to Junio's suggestion[1] during the last round further in the OOP
direction. Instead of switching on an enum describing the type of the
source, the source now carries function pointers for its read() and
seek() operations.
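
In code, that means a source now carries its own operations; this is
the shape that patch 1 below ends up with:

    struct bulk_checkin_source {
            off_t (*read)(struct bulk_checkin_source *, void *, size_t);
            off_t (*seek)(struct bulk_checkin_source *, off_t);

            /* ... per-source data, plus the common size/path fields ... */
    };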

The functionality in the end is the same, but this approach avoids
some of the namespacing issues that Peff pointed out while looking at
v4. It also ended up being less heavy-weight than I had originally
imagined, so I think this version is a worthwhile improvement over
v4.

Beyond that, the changes since last time can be viewed in the range-diff
below. Thanks in advance for any review!

[1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/

Taylor Blau (5):
  bulk-checkin: extract abstract `bulk_checkin_source`
  bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
  bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
  builtin/merge-tree.c: implement support for `--write-pack`

 Documentation/git-merge-tree.txt |   4 +
 builtin/merge-tree.c             |   5 +
 bulk-checkin.c                   | 197 +++++++++++++++++++++++++++----
 bulk-checkin.h                   |   8 ++
 merge-ort.c                      |  42 +++++--
 merge-recursive.h                |   1 +
 t/t4301-merge-tree-write-tree.sh |  93 +++++++++++++++
 7 files changed, 316 insertions(+), 34 deletions(-)

Range-diff against v4:
1:  97bb6e9f59 ! 1:  696aa027e4 bulk-checkin: extract abstract `bulk_checkin_source`
    @@ bulk-checkin.c: static int already_written(struct bulk_checkin_packfile *state,
      }
      
     +struct bulk_checkin_source {
    -+	enum { SOURCE_FILE } type;
    ++	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
    ++	off_t (*seek)(struct bulk_checkin_source *, off_t);
     +
    -+	/* SOURCE_FILE fields */
    -+	int fd;
    ++	union {
    ++		struct {
    ++			int fd;
    ++		} from_fd;
    ++	} data;
     +
    -+	/* common fields */
     +	size_t size;
     +	const char *path;
     +};
     +
    -+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
    -+					 off_t offset)
    ++static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
    ++					      void *buf, size_t nr)
     +{
    -+	switch (source->type) {
    -+	case SOURCE_FILE:
    -+		return lseek(source->fd, offset, SEEK_SET);
    -+	default:
    -+		BUG("unknown bulk-checkin source: %d", source->type);
    -+	}
    ++	return read_in_full(source->data.from_fd.fd, buf, nr);
     +}
     +
    -+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
    -+					void *buf, size_t nr)
    ++static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
    ++					      off_t offset)
     +{
    -+	switch (source->type) {
    -+	case SOURCE_FILE:
    -+		return read_in_full(source->fd, buf, nr);
    -+	default:
    -+		BUG("unknown bulk-checkin source: %d", source->type);
    -+	}
    ++	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
    ++}
    ++
    ++static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    ++					     int fd, size_t size,
    ++					     const char *path)
    ++{
    ++	memset(source, 0, sizeof(struct bulk_checkin_source));
    ++
    ++	source->read = bulk_checkin_source_read_from_fd;
    ++	source->seek = bulk_checkin_source_seek_from_fd;
    ++
    ++	source->data.from_fd.fd = fd;
    ++
    ++	source->size = size;
    ++	source->path = path;
     +}
     +
      /*
    @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
     -			ssize_t read_result = read_in_full(fd, ibuf, rsize);
     +			ssize_t read_result;
     +
    -+			read_result = bulk_checkin_source_read(source, ibuf,
    -+							       rsize);
    ++			read_result = source->read(source, ibuf, rsize);
      			if (read_result < 0)
     -				die_errno("failed to read from '%s'", path);
     +				die_errno("failed to read from '%s'",
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      	unsigned header_len;
      	struct hashfile_checkpoint checkpoint = {0};
      	struct pack_idx_entry *idx = NULL;
    -+	struct bulk_checkin_source source = {
    -+		.type = SOURCE_FILE,
    -+		.fd = fd,
    -+		.size = size,
    -+		.path = path,
    -+	};
    ++	struct bulk_checkin_source source;
    ++
    ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
      
      	seekback = lseek(fd, 0, SEEK_CUR);
      	if (seekback == (off_t) -1)
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      		state->offset = checkpoint.offset;
      		flush_bulk_checkin_packfile(state);
     -		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
    -+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
    ++		if (source.seek(&source, seekback) == (off_t)-1)
      			return error("cannot seek back");
      	}
      	the_hash_algo->final_oid_fn(result_oid, &ctx);
2:  9d633df339 < -:  ---------- bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
3:  d5bbd7810e ! 2:  596bd028a7 bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
    @@ Metadata
     Author: Taylor Blau <me@ttaylorr.com>
     
      ## Commit message ##
    -    bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
    +    bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
     
    -    Prepare for a future change where we will want to use a routine very
    -    similar to the existing `deflate_blob_to_pack()` but over arbitrary
    -    sources (i.e. either open file-descriptors, or a location in memory).
    +    The existing `stream_blob_to_pack()` function is named based on the fact
    +    that it knows only how to stream blobs into a bulk-checkin pack.
     
    -    Extract out a common "deflate_obj_to_pack()" routine that acts on a
    -    bulk_checkin_source, instead of a (int, size_t) pair. Then rewrite
    -    `deflate_blob_to_pack()` in terms of it.
    +    But there is no longer anything in this function which prevents us from
    +    writing objects of arbitrary types to the bulk-checkin pack. Prepare to
    +    write OBJ_TREEs by removing this assumption, adding an `enum
    +    object_type` parameter to this function's argument list, and renaming it
    +    to `stream_obj_to_pack()` accordingly.
     
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
      ## bulk-checkin.c ##
    +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    +  * status before calling us just in case we ask it to call us again
    +  * with a new pack.
    +  */
    +-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
    +-			       git_hash_ctx *ctx, off_t *already_hashed_to,
    +-			       struct bulk_checkin_source *source,
    +-			       unsigned flags)
    ++static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
    ++			      git_hash_ctx *ctx, off_t *already_hashed_to,
    ++			      struct bulk_checkin_source *source,
    ++			      enum object_type type, unsigned flags)
    + {
    + 	git_zstream s;
    + 	unsigned char ibuf[16384];
    +@@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
    + 
    + 	git_deflate_init(&s, pack_compression_level);
    + 
    +-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
    +-					      size);
    ++	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
    + 	s.next_out = obuf + hdrlen;
    + 	s.avail_out = sizeof(obuf) - hdrlen;
    + 
     @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *state,
      		die_errno("unable to write pack header");
      }
    @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat
      	unsigned header_len;
      	struct hashfile_checkpoint checkpoint = {0};
      	struct pack_idx_entry *idx = NULL;
    --	struct bulk_checkin_source source = {
    --		.type = SOURCE_FILE,
    --		.fd = fd,
    --		.size = size,
    --		.path = path,
    --	};
    +-	struct bulk_checkin_source source;
      
    +-	init_bulk_checkin_source_from_fd(&source, fd, size, path);
    +-
     -	seekback = lseek(fd, 0, SEEK_CUR);
     -	if (seekback == (off_t) -1)
     -		return error("cannot find the current offset");
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      		prepare_to_stream(state, flags);
      		if (idx) {
     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
    + 			idx->offset = state->offset;
      			crc32_begin(state->f);
      		}
    - 		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
    --					&source, OBJ_BLOB, flags))
    +-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
    +-					 &source, flags))
    ++		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
     +					source, type, flags))
      			break;
      		/*
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
      		hashfile_truncate(state->f, &checkpoint);
      		state->offset = checkpoint.offset;
      		flush_bulk_checkin_packfile(state);
    --		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
    -+		if (bulk_checkin_source_seek_to(source, seekback) == (off_t)-1)
    +-		if (source.seek(&source, seekback) == (off_t)-1)
    ++		if (source->seek(source, seekback) == (off_t)-1)
      			return error("cannot seek back");
      	}
      	the_hash_algo->final_oid_fn(result_oid, &ctx);
    @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
     +				int fd, size_t size,
     +				const char *path, unsigned flags)
     +{
    -+	struct bulk_checkin_source source = {
    -+		.type = SOURCE_FILE,
    -+		.fd = fd,
    -+		.size = size,
    -+		.path = path,
    -+	};
    -+	off_t seekback = lseek(fd, 0, SEEK_CUR);
    ++	struct bulk_checkin_source source;
    ++	off_t seekback;
    ++
    ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
    ++
    ++	seekback = lseek(fd, 0, SEEK_CUR);
     +	if (seekback == (off_t) -1)
     +		return error("cannot find the current offset");
     +
4:  e427fe6ad3 < -:  ---------- bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
5:  48095afe80 ! 3:  d8cf8e4395 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
    @@ Metadata
      ## Commit message ##
         bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
     
    -    Now that we have factored out many of the common routines necessary to
    -    index a new object into a pack created by the bulk-checkin machinery, we
    -    can introduce a variant of `index_blob_bulk_checkin()` that acts on
    -    blobs whose contents we can fit in memory.
    +    Introduce `index_blob_bulk_checkin_incore()` which allows streaming
    +    arbitrary blob contents from memory into the bulk-checkin pack.
    +
    +    In order to support streaming from a location in memory, we must
    +    implement a new kind of bulk_checkin_source that does just that. The
    +    implementation is spread out across:
    +
    +      - init_bulk_checkin_source_incore()
    +      - bulk_checkin_source_read_incore()
    +      - bulk_checkin_source_seek_incore()
    +
    +    Note that, unlike file descriptors, which manage their own offset
    +    internally, we have to keep track of how many bytes we've read out of
    +    the buffer, and make sure we don't read past the end of the buffer.
     
         This will be useful in a couple of more commits in order to provide the
         `merge-tree` builtin with a mechanism to create a new pack containing
    @@ Commit message
         Signed-off-by: Taylor Blau <me@ttaylorr.com>
     
      ## bulk-checkin.c ##
    +@@ bulk-checkin.c: struct bulk_checkin_source {
    + 		struct {
    + 			int fd;
    + 		} from_fd;
    ++		struct {
    ++			const void *buf;
    ++			size_t nr_read;
    ++		} incore;
    + 	} data;
    + 
    + 	size_t size;
    +@@ bulk-checkin.c: static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source
    + 	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
    + }
    + 
    ++static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
    ++					     void *buf, size_t nr)
    ++{
    ++	const unsigned char *src = source->data.incore.buf;
    ++
    ++	if (source->data.incore.nr_read > source->size)
    ++		BUG("read beyond bulk-checkin source buffer end "
    ++		    "(%"PRIuMAX" > %"PRIuMAX")",
    ++		    (uintmax_t)source->data.incore.nr_read,
    ++		    (uintmax_t)source->size);
    ++
    ++	if (nr > source->size - source->data.incore.nr_read)
    ++		nr = source->size - source->data.incore.nr_read;
    ++
    ++	src += source->data.incore.nr_read;
    ++
    ++	memcpy(buf, src, nr);
    ++	source->data.incore.nr_read += nr;
    ++	return nr;
    ++}
    ++
    ++static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
    ++					     off_t offset)
    ++{
    ++	if (!(0 <= offset && offset < source->size))
    ++		return (off_t)-1;
    ++	source->data.incore.nr_read = offset;
    ++	return source->data.incore.nr_read;
    ++}
    ++
    + static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    + 					     int fd, size_t size,
    + 					     const char *path)
    +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
    + 	source->path = path;
    + }
    + 
    ++static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source,
    ++					    const void *buf, size_t size,
    ++					    const char *path)
    ++{
    ++	memset(source, 0, sizeof(struct bulk_checkin_source));
    ++
    ++	source->read = bulk_checkin_source_read_incore;
    ++	source->seek = bulk_checkin_source_seek_incore;
    ++
    ++	source->data.incore.buf = buf;
    ++	source->data.incore.nr_read = 0;
    ++
    ++	source->size = size;
    ++	source->path = path;
    ++}
    ++
    + /*
    +  * Read the contents from 'source' for 'size' bytes, streaming it to the
    +  * packfile in state while updating the hash in ctx. Signal a failure
     @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
      	return 0;
      }
    @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *sta
     +				       const char *path, enum object_type type,
     +				       unsigned flags)
     +{
    -+	struct bulk_checkin_source source = {
    -+		.type = SOURCE_INCORE,
    -+		.buf = buf,
    -+		.size = size,
    -+		.read = 0,
    -+		.path = path,
    -+	};
    ++	struct bulk_checkin_source source;
    ++
    ++	init_bulk_checkin_source_incore(&source, buf, size, path);
     +
     +	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
     +}
6:  60568f9281 = 4:  2670192802 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
7:  b9be9df122 ! 5:  3595db76a5 builtin/merge-tree.c: implement support for `--write-pack`
    @@ Documentation/git-merge-tree.txt: OPTIONS
     
      ## builtin/merge-tree.c ##
     @@
    - #include "quote.h"
      #include "tree.h"
      #include "config.h"
    + #include "strvec.h"
     +#include "bulk-checkin.h"
      
      static int line_termination = '\n';
      
     @@ builtin/merge-tree.c: struct merge_tree_options {
    - 	int show_messages;
      	int name_only;
      	int use_stdin;
    + 	struct merge_options merge_options;
     +	int write_pack;
      };
      
      static int real_merge(struct merge_tree_options *o,
     @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
    - 	init_merge_options(&opt, the_repository);
    + 				 _("not something we can merge"));
      
      	opt.show_rename_progress = 0;
     +	opt.write_pack = o->write_pack;
    @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
      	opt.branch1 = branch1;
      	opt.branch2 = branch2;
     @@ builtin/merge-tree.c: int cmd_merge_tree(int argc, const char **argv, const char *prefix)
    - 			   &merge_base,
    - 			   N_("commit"),
      			   N_("specify a merge-base for the merge")),
    + 		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
    + 			N_("option for selected merge strategy")),
     +		OPT_BOOL(0, "write-pack", &o.write_pack,
     +			 N_("write new objects to a pack instead of as loose")),
      		OPT_END()

base-commit: ceadf0f3cf51550166a387ec8508bb55e7883057
-- 
2.42.0.425.g963d08ddb3.dirty

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
@ 2023-10-23 22:44   ` Taylor Blau
  2023-10-25  7:37     ` Jeff King
  2023-10-23 22:44   ` [PATCH v5 2/5] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types Taylor Blau
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-10-23 22:44 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

A future commit will want to implement a routine very similar to the
existing `stream_blob_to_pack()`, with two notable changes:
  - Instead of streaming just OBJ_BLOBs, this new function may want to
    stream objects of arbitrary type.

  - Instead of streaming the object's contents from an open
    file-descriptor, this new function may want to "stream" its contents
    from memory.

To avoid duplicating a significant chunk of code between the existing
`stream_blob_to_pack()` and that future routine, extract an abstract
`bulk_checkin_source`. This concept is currently a thin layer over
`lseek()` and `read_in_full()`, but will grow to understand how to
perform analogous operations when writing out an object's contents
from memory.
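
Schematically, callers then only ever touch a source through these
two operations (a condensed view of the diff below):

    read_result = source->read(source, ibuf, rsize);

    /* the pack hit its size limit: rewind and retry in a fresh pack */
    if (source.seek(&source, seekback) == (off_t)-1)
            return error("cannot seek back");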

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 65 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 8 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6ce62999e5..174a6c24e4 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -140,8 +140,49 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
 	return 0;
 }
 
+struct bulk_checkin_source {
+	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
+	off_t (*seek)(struct bulk_checkin_source *, off_t);
+
+	union {
+		struct {
+			int fd;
+		} from_fd;
+	} data;
+
+	size_t size;
+	const char *path;
+};
+
+static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
+					      void *buf, size_t nr)
+{
+	return read_in_full(source->data.from_fd.fd, buf, nr);
+}
+
+static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
+					      off_t offset)
+{
+	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
+}
+
+static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
+					     int fd, size_t size,
+					     const char *path)
+{
+	memset(source, 0, sizeof(struct bulk_checkin_source));
+
+	source->read = bulk_checkin_source_read_from_fd;
+	source->seek = bulk_checkin_source_seek_from_fd;
+
+	source->data.from_fd.fd = fd;
+
+	source->size = size;
+	source->path = path;
+}
+
 /*
- * Read the contents from fd for size bytes, streaming it to the
+ * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
  * by returning a negative value when the resulting pack would exceed
  * the pack size limit and this is not the first object in the pack,
@@ -157,7 +198,7 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
  */
 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       int fd, size_t size, const char *path,
+			       struct bulk_checkin_source *source,
 			       unsigned flags)
 {
 	git_zstream s;
@@ -167,22 +208,27 @@ static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 	int status = Z_OK;
 	int write_object = (flags & HASH_WRITE_OBJECT);
 	off_t offset = 0;
+	size_t size = source->size;
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
+					      size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
 	while (status != Z_STREAM_END) {
 		if (size && !s.avail_in) {
 			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
-			ssize_t read_result = read_in_full(fd, ibuf, rsize);
+			ssize_t read_result;
+
+			read_result = source->read(source, ibuf, rsize);
 			if (read_result < 0)
-				die_errno("failed to read from '%s'", path);
+				die_errno("failed to read from '%s'",
+					  source->path);
 			if (read_result != rsize)
 				die("failed to read %d bytes from '%s'",
-				    (int)rsize, path);
+				    (int)rsize, source->path);
 			offset += rsize;
 			if (*already_hashed_to < offset) {
 				size_t hsize = offset - *already_hashed_to;
@@ -258,6 +304,9 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
+	struct bulk_checkin_source source;
+
+	init_bulk_checkin_source_from_fd(&source, fd, size, path);
 
 	seekback = lseek(fd, 0, SEEK_CUR);
 	if (seekback == (off_t) -1)
@@ -283,7 +332,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			crc32_begin(state->f);
 		}
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 fd, size, path, flags))
+					 &source, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
@@ -295,7 +344,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		hashfile_truncate(state->f, &checkpoint);
 		state->offset = checkpoint.offset;
 		flush_bulk_checkin_packfile(state);
-		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+		if (source.seek(&source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	the_hash_algo->final_oid_fn(result_oid, &ctx);
-- 
2.42.0.425.g963d08ddb3.dirty


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v5 2/5] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
  2023-10-23 22:44   ` [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
@ 2023-10-23 22:44   ` Taylor Blau
  2023-10-23 22:45   ` [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-23 22:44 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

The existing `stream_blob_to_pack()` function is named based on the fact
that it knows only how to stream blobs into a bulk-checkin pack.

But there is no longer anything in this function which prevents us from
writing objects of arbitrary types to the bulk-checkin pack. Prepare to
write OBJ_TREEs by removing this assumption, adding an `enum
object_type` parameter to this function's argument list, and renaming it
to `stream_obj_to_pack()` accordingly.
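
A blob call site then spells out the object type it previously only
implied, e.g. (adapted from the diff below):

    if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
                            source, OBJ_BLOB, flags))
            break;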

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 61 +++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 174a6c24e4..79776e679e 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -196,10 +196,10 @@ static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
  * status before calling us just in case we ask it to call us again
  * with a new pack.
  */
-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
-			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       struct bulk_checkin_source *source,
-			       unsigned flags)
+static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
+			      git_hash_ctx *ctx, off_t *already_hashed_to,
+			      struct bulk_checkin_source *source,
+			      enum object_type type, unsigned flags)
 {
 	git_zstream s;
 	unsigned char ibuf[16384];
@@ -212,8 +212,7 @@ static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
-					      size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
@@ -293,27 +292,23 @@ static void prepare_to_stream(struct bulk_checkin_packfile *state,
 		die_errno("unable to write pack header");
 }
 
-static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
-				struct object_id *result_oid,
-				int fd, size_t size,
-				const char *path, unsigned flags)
+
+static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
+			       struct object_id *result_oid,
+			       struct bulk_checkin_source *source,
+			       enum object_type type,
+			       off_t seekback,
+			       unsigned flags)
 {
-	off_t seekback, already_hashed_to;
+	off_t already_hashed_to = 0;
 	git_hash_ctx ctx;
 	unsigned char obuf[16384];
 	unsigned header_len;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
-	struct bulk_checkin_source source;
 
-	init_bulk_checkin_source_from_fd(&source, fd, size, path);
-
-	seekback = lseek(fd, 0, SEEK_CUR);
-	if (seekback == (off_t) -1)
-		return error("cannot find the current offset");
-
-	header_len = format_object_header((char *)obuf, sizeof(obuf),
-					  OBJ_BLOB, size);
+	header_len = format_object_header((char *)obuf, sizeof(obuf), type,
+					  source->size);
 	the_hash_algo->init_fn(&ctx);
 	the_hash_algo->update_fn(&ctx, obuf, header_len);
 	the_hash_algo->init_fn(&checkpoint.ctx);
@@ -322,8 +317,6 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	if ((flags & HASH_WRITE_OBJECT) != 0)
 		CALLOC_ARRAY(idx, 1);
 
-	already_hashed_to = 0;
-
 	while (1) {
 		prepare_to_stream(state, flags);
 		if (idx) {
@@ -331,8 +324,8 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 			idx->offset = state->offset;
 			crc32_begin(state->f);
 		}
-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 &source, flags))
+		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
+					source, type, flags))
 			break;
 		/*
 		 * Writing this object to the current pack will make
@@ -344,7 +337,7 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 		hashfile_truncate(state->f, &checkpoint);
 		state->offset = checkpoint.offset;
 		flush_bulk_checkin_packfile(state);
-		if (source.seek(&source, seekback) == (off_t)-1)
+		if (source->seek(source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	the_hash_algo->final_oid_fn(result_oid, &ctx);
@@ -366,6 +359,24 @@ static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	return 0;
 }
 
+static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
+				struct object_id *result_oid,
+				int fd, size_t size,
+				const char *path, unsigned flags)
+{
+	struct bulk_checkin_source source;
+	off_t seekback;
+
+	init_bulk_checkin_source_from_fd(&source, fd, size, path);
+
+	seekback = lseek(fd, 0, SEEK_CUR);
+	if (seekback == (off_t) -1)
+		return error("cannot find the current offset");
+
+	return deflate_obj_to_pack(state, result_oid, &source, OBJ_BLOB,
+				   seekback, flags);
+}
+
 void prepare_loose_object_bulk_checkin(void)
 {
 	/*
-- 
2.42.0.425.g963d08ddb3.dirty


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
  2023-10-23 22:44   ` [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
  2023-10-23 22:44   ` [PATCH v5 2/5] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types Taylor Blau
@ 2023-10-23 22:45   ` Taylor Blau
  2023-10-25  7:58     ` Patrick Steinhardt
  2023-10-23 22:45   ` [PATCH v5 4/5] bulk-checkin: introduce `index_tree_bulk_checkin_incore()` Taylor Blau
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-10-23 22:45 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

Introduce `index_blob_bulk_checkin_incore()` which allows streaming
arbitrary blob contents from memory into the bulk-checkin pack.

In order to support streaming from a location in memory, we must
implement a new kind of bulk_checkin_source that does just that. The
implementation is spread out across:

  - init_bulk_checkin_source_incore()
  - bulk_checkin_source_read_incore()
  - bulk_checkin_source_seek_incore()

Note that, unlike file descriptors, which manage their own offset
internally, we have to keep track of how many bytes we've read out of
the buffer, and make sure we don't read past the end of the buffer.
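
A quick worked example of the clamping this implies, with made-up
numbers:

    /* size = 100, nr_read = 96, caller asks for nr = 16 */
    if (nr > source->size - source->data.incore.nr_read)
            nr = source->size - source->data.incore.nr_read;
    /* nr is clamped to 4; once nr_read reaches 100, reads return 0 */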

This will be useful in a couple of more commits in order to provide the
`merge-tree` builtin with a mechanism to create a new pack containing
any objects it created during the merge, instead of storing those
objects individually as loose.

Similar to the existing `index_blob_bulk_checkin()` function, the
entrypoint delegates to `deflate_obj_to_pack_incore()`. That function in
turn delegates to deflate_obj_to_pack(), which is responsible for
formatting the pack header and then deflating the contents into the
pack.

Consistent with the rest of the bulk-checkin mechanism, there are no
direct tests here. In future commits when we expose this new
functionality via the `merge-tree` builtin, we will test it indirectly
there.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 bulk-checkin.h |  4 +++
 2 files changed, 79 insertions(+)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index 79776e679e..b728210bc7 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -148,6 +148,10 @@ struct bulk_checkin_source {
 		struct {
 			int fd;
 		} from_fd;
+		struct {
+			const void *buf;
+			size_t nr_read;
+		} incore;
 	} data;
 
 	size_t size;
@@ -166,6 +170,36 @@ static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source
 	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
 }
 
+static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
+					     void *buf, size_t nr)
+{
+	const unsigned char *src = source->data.incore.buf;
+
+	if (source->data.incore.nr_read > source->size)
+		BUG("read beyond bulk-checkin source buffer end "
+		    "(%"PRIuMAX" > %"PRIuMAX")",
+		    (uintmax_t)source->data.incore.nr_read,
+		    (uintmax_t)source->size);
+
+	if (nr > source->size - source->data.incore.nr_read)
+		nr = source->size - source->data.incore.nr_read;
+
+	src += source->data.incore.nr_read;
+
+	memcpy(buf, src, nr);
+	source->data.incore.nr_read += nr;
+	return nr;
+}
+
+static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
+					     off_t offset)
+{
+	if (!(0 <= offset && offset < source->size))
+		return (off_t)-1;
+	source->data.incore.nr_read = offset;
+	return source->data.incore.nr_read;
+}
+
 static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
 					     int fd, size_t size,
 					     const char *path)
@@ -181,6 +215,22 @@ static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
 	source->path = path;
 }
 
+static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source,
+					    const void *buf, size_t size,
+					    const char *path)
+{
+	memset(source, 0, sizeof(struct bulk_checkin_source));
+
+	source->read = bulk_checkin_source_read_incore;
+	source->seek = bulk_checkin_source_seek_incore;
+
+	source->data.incore.buf = buf;
+	source->data.incore.nr_read = 0;
+
+	source->size = size;
+	source->path = path;
+}
+
 /*
  * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
@@ -359,6 +409,19 @@ static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
 	return 0;
 }
 
+static int deflate_obj_to_pack_incore(struct bulk_checkin_packfile *state,
+				       struct object_id *result_oid,
+				       const void *buf, size_t size,
+				       const char *path, enum object_type type,
+				       unsigned flags)
+{
+	struct bulk_checkin_source source;
+
+	init_bulk_checkin_source_incore(&source, buf, size, path);
+
+	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
+}
+
 static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 				struct object_id *result_oid,
 				int fd, size_t size,
@@ -421,6 +484,18 @@ int index_blob_bulk_checkin(struct object_id *oid,
 	return status;
 }
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
+						buf, size, path, OBJ_BLOB,
+						flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index aa7286a7b3..1b91daeaee 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -13,6 +13,10 @@ int index_blob_bulk_checkin(struct object_id *oid,
 			    int fd, size_t size,
 			    const char *path, unsigned flags);
 
+int index_blob_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called
-- 
2.42.0.425.g963d08ddb3.dirty


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v5 4/5] bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
                     ` (2 preceding siblings ...)
  2023-10-23 22:45   ` [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
@ 2023-10-23 22:45   ` Taylor Blau
  2023-10-23 22:45   ` [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-23 22:45 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

The remaining missing piece in order to teach the `merge-tree` builtin
how to write the contents of a merge into a pack is a function to index
tree objects into a bulk-checkin pack.

This patch implements that missing piece, which is a thin wrapper around
all of the functionality introduced in previous commits.

If and when Git gains support for a "compatibility" hash algorithm, the
changes to support that here will be minimal. The bulk-checkin machinery
will need to convert the incoming tree to compute its length under the
compatibility hash, which is necessary to reconstruct its header. With that
information (and the converted contents of the tree), the bulk-checkin
machinery will have enough to keep track of the converted object's hash
in order to update the compatibility mapping.

Within some thin wrapper around `deflate_obj_to_pack_incore()` (perhaps
`deflate_tree_to_pack_incore()`), the changes should be limited to
something like:

    struct strbuf converted = STRBUF_INIT;
    if (the_repository->compat_hash_algo) {
      if (convert_object_file(&compat_obj,
                              the_repository->hash_algo,
                              the_repository->compat_hash_algo, ...) < 0)
        die(...);

      format_object_header_hash(the_repository->compat_hash_algo,
                                OBJ_TREE, size);
    }
    /* compute the converted tree's hash using the compat algorithm */
    strbuf_release(&converted);

, assuming related changes throughout the rest of the bulk-checkin
machinery necessary to update the hash of the converted object, which
are likewise minimal in size.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 12 ++++++++++++
 bulk-checkin.h |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/bulk-checkin.c b/bulk-checkin.c
index b728210bc7..bd6151ba3c 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -496,6 +496,18 @@ int index_blob_bulk_checkin_incore(struct object_id *oid,
 	return status;
 }
 
+int index_tree_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags)
+{
+	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
+						buf, size, path, OBJ_TREE,
+						flags);
+	if (!odb_transaction_nesting)
+		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
+	return status;
+}
+
 void begin_odb_transaction(void)
 {
 	odb_transaction_nesting += 1;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index 1b91daeaee..89786b3954 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -17,6 +17,10 @@ int index_blob_bulk_checkin_incore(struct object_id *oid,
 				   const void *buf, size_t size,
 				   const char *path, unsigned flags);
 
+int index_tree_bulk_checkin_incore(struct object_id *oid,
+				   const void *buf, size_t size,
+				   const char *path, unsigned flags);
+
 /*
  * Tell the object database to optimize for adding
  * multiple objects. end_odb_transaction must be called
-- 
2.42.0.425.g963d08ddb3.dirty


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
                     ` (3 preceding siblings ...)
  2023-10-23 22:45   ` [PATCH v5 4/5] bulk-checkin: introduce `index_tree_bulk_checkin_incore()` Taylor Blau
@ 2023-10-23 22:45   ` Taylor Blau
  2023-10-25  7:58     ` Patrick Steinhardt
  2023-11-10 23:51     ` Elijah Newren
  2023-10-23 23:31   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Junio C Hamano
  2023-10-25  7:58   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Patrick Steinhardt
  6 siblings, 2 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-23 22:45 UTC (permalink / raw)
  To: git
  Cc: Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano,
	Patrick Steinhardt

When using merge-tree often within a repository[^1], it is possible to
generate a relatively large number of loose objects, which can result in
degraded performance, and inode exhaustion in extreme cases.

Building on the functionality introduced in previous commits, the
bulk-checkin machinery now has support to write arbitrary blob and tree
objects which are small enough to be held in-core. We can use this to
write any blob/tree objects generated by ORT into a separate pack
instead of writing them out individually as loose.

This functionality is gated behind a new `--write-pack` option to
`merge-tree` that works with the (non-deprecated) `--write-tree` mode.
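
For example (mirroring the tests added below):

    $ git merge-tree --write-pack refs/tags/side refs/tags/other

prints the OID of the merged tree as usual, but writes any new objects
into a pack under `$GIT_DIR/objects/pack` instead of as loose objects.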

The implementation is relatively straightforward. There are two spots
within the ORT mechanism where we call `write_object_file()`, one for
content differences within blobs, and another to assemble any new trees
necessary to construct the merge. In each of those locations,
conditionally replace calls to `write_object_file()` with
`index_blob_bulk_checkin_incore()` or `index_tree_bulk_checkin_incore()`
depending on which kind of object we are writing.

The only remaining task is to begin and end the transaction necessary to
initialize the bulk-checkin machinery, and move any new pack(s) it
created into the main object store.

[^1]: Such is the case at GitHub, where we run presumptive "test merges"
  on open pull requests to see whether or not we can light up the merge
  button green depending on whether or not the presumptive merge was
  conflicted.

  This is done in response to a number of user-initiated events,
  including viewing an open pull request whose last test merge is stale
  with respect to the current base and tip of the pull request. As a
  result, merge-tree can be run very frequently on large, active
  repositories.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-merge-tree.txt |  4 ++
 builtin/merge-tree.c             |  5 ++
 merge-ort.c                      | 42 +++++++++++----
 merge-recursive.h                |  1 +
 t/t4301-merge-tree-write-tree.sh | 93 ++++++++++++++++++++++++++++++++
 5 files changed, 136 insertions(+), 9 deletions(-)

diff --git a/Documentation/git-merge-tree.txt b/Documentation/git-merge-tree.txt
index ffc4fbf7e8..9d37609ef1 100644
--- a/Documentation/git-merge-tree.txt
+++ b/Documentation/git-merge-tree.txt
@@ -69,6 +69,10 @@ OPTIONS
 	specify a merge-base for the merge, and specifying multiple bases is
 	currently not supported. This option is incompatible with `--stdin`.
 
+--write-pack::
+	Write any new objects into a separate packfile instead of as
+	individual loose objects.
+
 [[OUTPUT]]
 OUTPUT
 ------
diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c
index a35e0452d6..218442ac9b 100644
--- a/builtin/merge-tree.c
+++ b/builtin/merge-tree.c
@@ -19,6 +19,7 @@
 #include "tree.h"
 #include "config.h"
 #include "strvec.h"
+#include "bulk-checkin.h"
 
 static int line_termination = '\n';
 
@@ -416,6 +417,7 @@ struct merge_tree_options {
 	int name_only;
 	int use_stdin;
 	struct merge_options merge_options;
+	int write_pack;
 };
 
 static int real_merge(struct merge_tree_options *o,
@@ -441,6 +443,7 @@ static int real_merge(struct merge_tree_options *o,
 				 _("not something we can merge"));
 
 	opt.show_rename_progress = 0;
+	opt.write_pack = o->write_pack;
 
 	opt.branch1 = branch1;
 	opt.branch2 = branch2;
@@ -553,6 +556,8 @@ int cmd_merge_tree(int argc, const char **argv, const char *prefix)
 			   N_("specify a merge-base for the merge")),
 		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
 			N_("option for selected merge strategy")),
+		OPT_BOOL(0, "write-pack", &o.write_pack,
+			 N_("write new objects to a pack instead of as loose")),
 		OPT_END()
 	};
 
diff --git a/merge-ort.c b/merge-ort.c
index 3653725661..523577d71e 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -48,6 +48,7 @@
 #include "tree.h"
 #include "unpack-trees.h"
 #include "xdiff-interface.h"
+#include "bulk-checkin.h"
 
 /*
  * We have many arrays of size 3.  Whenever we have such an array, the
@@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt,
 		if ((merge_status < 0) || !result_buf.ptr)
 			ret = error(_("failed to execute internal merge"));
 
-		if (!ret &&
-		    write_object_file(result_buf.ptr, result_buf.size,
-				      OBJ_BLOB, &result->oid))
-			ret = error(_("unable to add %s to database"), path);
+		if (!ret) {
+			ret = opt->write_pack
+				? index_blob_bulk_checkin_incore(&result->oid,
+								 result_buf.ptr,
+								 result_buf.size,
+								 path, 1)
+				: write_object_file(result_buf.ptr,
+						    result_buf.size,
+						    OBJ_BLOB, &result->oid);
+			if (ret)
+				ret = error(_("unable to add %s to database"),
+					    path);
+		}
 
 		free(result_buf.ptr);
 		if (ret)
@@ -3597,7 +3607,8 @@ static int tree_entry_order(const void *a_, const void *b_)
 				 b->string, strlen(b->string), bmi->result.mode);
 }
 
-static int write_tree(struct object_id *result_oid,
+static int write_tree(struct merge_options *opt,
+		      struct object_id *result_oid,
 		      struct string_list *versions,
 		      unsigned int offset,
 		      size_t hash_size)
@@ -3631,8 +3642,14 @@ static int write_tree(struct object_id *result_oid,
 	}
 
 	/* Write this object file out, and record in result_oid */
-	if (write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid))
+	ret = opt->write_pack
+		? index_tree_bulk_checkin_incore(result_oid,
+						 buf.buf, buf.len, "", 1)
+		: write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid);
+
+	if (ret)
 		ret = -1;
+
 	strbuf_release(&buf);
 	return ret;
 }
@@ -3797,8 +3814,8 @@ static int write_completed_directory(struct merge_options *opt,
 		 */
 		dir_info->is_null = 0;
 		dir_info->result.mode = S_IFDIR;
-		if (write_tree(&dir_info->result.oid, &info->versions, offset,
-			       opt->repo->hash_algo->rawsz) < 0)
+		if (write_tree(opt, &dir_info->result.oid, &info->versions,
+			       offset, opt->repo->hash_algo->rawsz) < 0)
 			ret = -1;
 	}
 
@@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt,
 		fflush(stdout);
 		BUG("dir_metadata accounting completely off; shouldn't happen");
 	}
-	if (write_tree(result_oid, &dir_metadata.versions, 0,
+	if (write_tree(opt, result_oid, &dir_metadata.versions, 0,
 		       opt->repo->hash_algo->rawsz) < 0)
 		ret = -1;
+
+	if (opt->write_pack)
+		end_odb_transaction();
+
 cleanup:
 	string_list_clear(&plist, 0);
 	string_list_clear(&dir_metadata.versions, 0);
@@ -4878,6 +4899,9 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
 	 */
 	strmap_init(&opt->priv->conflicts);
 
+	if (opt->write_pack)
+		begin_odb_transaction();
+
 	trace2_region_leave("merge", "allocate/init", opt->repo);
 }
 
diff --git a/merge-recursive.h b/merge-recursive.h
index 3d3b3e3c29..5c5ff380a8 100644
--- a/merge-recursive.h
+++ b/merge-recursive.h
@@ -48,6 +48,7 @@ struct merge_options {
 	unsigned renormalize : 1;
 	unsigned record_conflict_msgs_as_headers : 1;
 	const char *msg_header_prefix;
+	unsigned write_pack : 1;
 
 	/* internal fields used by the implementation */
 	struct merge_options_internal *priv;
diff --git a/t/t4301-merge-tree-write-tree.sh b/t/t4301-merge-tree-write-tree.sh
index b2c8a43fce..d2a8634523 100755
--- a/t/t4301-merge-tree-write-tree.sh
+++ b/t/t4301-merge-tree-write-tree.sh
@@ -945,4 +945,97 @@ test_expect_success 'check the input format when --stdin is passed' '
 	test_cmp expect actual
 '
 
+packdir=".git/objects/pack"
+
+test_expect_success 'merge-tree can pack its result with --write-pack' '
+	test_when_finished "rm -rf repo" &&
+	git init repo &&
+
+	# base has lines [3, 4, 5]
+	#   - side adds to the beginning, resulting in [1, 2, 3, 4, 5]
+	#   - other adds to the end, resulting in [3, 4, 5, 6, 7]
+	#
+	# merging the two should result in a new blob object containing
+	# [1, 2, 3, 4, 5, 6, 7], along with a new tree.
+	test_commit -C repo base file "$(test_seq 3 5)" &&
+	git -C repo branch -M main &&
+	git -C repo checkout -b side main &&
+	test_commit -C repo side file "$(test_seq 1 5)" &&
+	git -C repo checkout -b other main &&
+	test_commit -C repo other file "$(test_seq 3 7)" &&
+
+	find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
+	tree="$(git -C repo merge-tree --write-pack \
+		refs/tags/side refs/tags/other)" &&
+	blob="$(git -C repo rev-parse $tree:file)" &&
+	find repo/$packdir -type f -name "pack-*.idx" >packs.after &&
+
+	test_must_be_empty packs.before &&
+	test_line_count = 1 packs.after &&
+
+	git show-index <$(cat packs.after) >objects &&
+	test_line_count = 2 objects &&
+	grep "^[1-9][0-9]* $tree" objects &&
+	grep "^[1-9][0-9]* $blob" objects
+'
+
+test_expect_success 'merge-tree can write multiple packs with --write-pack' '
+	test_when_finished "rm -rf repo" &&
+	git init repo &&
+	(
+		cd repo &&
+
+		git config pack.packSizeLimit 512 &&
+
+		test_seq 512 >f &&
+
+		# "f" contains roughly 2,000 bytes.
+		#
+		# Each side ("foo" and "bar") adds a small amount of data at the
+		# beginning and end of "base", respectively.
+		git add f &&
+		test_tick &&
+		git commit -m base &&
+		git branch -M main &&
+
+		git checkout -b foo main &&
+		{
+			echo foo && cat f
+		} >f.tmp &&
+		mv f.tmp f &&
+		git add f &&
+		test_tick &&
+		git commit -m foo &&
+
+		git checkout -b bar main &&
+		echo bar >>f &&
+		git add f &&
+		test_tick &&
+		git commit -m bar &&
+
+		find $packdir -type f -name "pack-*.idx" >packs.before &&
+		# Merging either side should result in a new object which is
+		# larger than the pack.packSizeLimit set above, thus the
+		# result should be split into two separate packs.
+		tree="$(git merge-tree --write-pack \
+			refs/heads/foo refs/heads/bar)" &&
+		blob="$(git rev-parse $tree:f)" &&
+		find $packdir -type f -name "pack-*.idx" >packs.after &&
+
+		test_must_be_empty packs.before &&
+		test_line_count = 2 packs.after &&
+		for idx in $(cat packs.after)
+		do
+			git show-index <$idx || return 1
+		done >objects &&
+
+		# The resulting set of packs should contain one copy of both
+		# objects, each in a separate pack.
+		test_line_count = 2 objects &&
+		grep "^[1-9][0-9]* $tree" objects &&
+		grep "^[1-9][0-9]* $blob" objects
+
+	)
+'
+
 test_done
-- 
2.42.0.425.g963d08ddb3.dirty

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
                     ` (4 preceding siblings ...)
  2023-10-23 22:45   ` [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
@ 2023-10-23 23:31   ` Junio C Hamano
  2023-11-06 15:46     ` Johannes Schindelin
  2023-10-25  7:58   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Patrick Steinhardt
  6 siblings, 1 reply; 63+ messages in thread
From: Junio C Hamano @ 2023-10-23 23:31 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Patrick Steinhardt

Taylor Blau <me@ttaylorr.com> writes:

> But this
> approach ended up being less heavy-weight than I had originally
> imagined, so I think that this version is a worthwhile improvement
> over v4.

;-).

This version is a good place to stop, a bit short of going full OO.
Nicely done.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  2023-10-23 18:58     ` Jeff King
@ 2023-10-24  6:34       ` Patrick Steinhardt
  2023-10-24 17:08         ` Junio C Hamano
  0 siblings, 1 reply; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-24  6:34 UTC (permalink / raw)
  To: Jeff King
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Junio C Hamano

[-- Attachment #1: Type: text/plain, Size: 1213 bytes --]

On Mon, Oct 23, 2023 at 02:58:42PM -0400, Jeff King wrote:
> On Mon, Oct 23, 2023 at 11:19:13AM +0200, Patrick Steinhardt wrote:
> 
> > > +	case SOURCE_INCORE:
> > > +		assert(source->read <= source->size);
> > 
> > Is there any guideline around when to use `assert()` vs `BUG()`? I think
> > that this assertion here is quite critical, because when it does not
> > hold we can end up performing out-of-bounds reads and writes. But as
> > asserts are typically missing in non-debug builds, this safeguard would
> > not do anything for our end users, right?
> 
> I don't think we have a written guideline. My philosophy is: always use
> BUG(), because you will never be surprised that the assertion was not
> compiled in (and I think compiling without assertions is almost
> certainly premature optimization, especially given the way we tend to
> use them).
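
For example, the assertion quoted above could become (an untested
sketch, reusing the same fields):

  if (source->read > source->size)
	BUG("bulk-checkin source read beyond buffer end "
	    "(%"PRIuMAX" > %"PRIuMAX")",
	    (uintmax_t)source->read, (uintmax_t)source->size);

which stays enabled in every build and reports the offending values
when it trips.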
> 
> -Peff

I'm inclined to agree with your philosophy. Makes me wonder whether we
should write a Coccinelle rule to catch this. But a quick-and-dirty grep
in our codebase shows that such a rule would cause quite a lot of churn:

$ git grep BUG\( | wc -l
677
$ git grep assert\( | wc -l
549

Probably not worth it.
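
(For the record, I'd imagine the rule itself would be quite simple,
something like this rough, untested sketch, modulo picking a useful
message for each site:

  @@
  expression E;
  @@
  - assert(E);
  + if (!(E))
  + 	BUG("...");

the churn is really in filling in those messages.)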

Patrick

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
  2023-10-24  6:34       ` Patrick Steinhardt
@ 2023-10-24 17:08         ` Junio C Hamano
  0 siblings, 0 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-10-24 17:08 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: Jeff King, Taylor Blau, git, Elijah Newren, Eric W. Biederman

Patrick Steinhardt <ps@pks.im> writes:

> I'm inclined to agree with your philosophy. Makes me wonder whether we
> should write a Coccinelle rule to catch this. But a quick-and-dirty grep
> in our codebase shows that such a rule would cause quite a lot of churn:
>
> $ git grep BUG\( | wc -l
> 677
> $ git grep assert\( | wc -l
> 549
>
> Probably not worth it.

Yeah, we can stick to our usual "do not add new instances, fix them
while touching near-by code" pattern for this one, I would say.

Thanks.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-23 22:44   ` [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
@ 2023-10-25  7:37     ` Jeff King
  2023-10-25 15:39       ` Taylor Blau
  2023-10-27 23:12       ` Junio C Hamano
  0 siblings, 2 replies; 63+ messages in thread
From: Jeff King @ 2023-10-25  7:37 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Mon, Oct 23, 2023 at 06:44:56PM -0400, Taylor Blau wrote:

> +struct bulk_checkin_source {
> +	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
> +	off_t (*seek)(struct bulk_checkin_source *, off_t);
> +
> +	union {
> +		struct {
> +			int fd;
> +		} from_fd;
> +	} data;
> +
> +	size_t size;
> +	const char *path;
> +};

The virtual functions combined with the union are a funny mix of
object-oriented and procedural code. The bulk_checkin_source has
totally virtualized functions, but knows about all of the ancillary data
each set of virtualized functions might want. ;)

I think the more pure OO version would embed the parent, and have each
concrete type define its own struct type, like:

  struct bulk_checkin_source_fd {
	struct bulk_checkin_source src;
	int fd;
  };

That works great if the code which constructs it knows which concrete
type it wants, and can just do:

  struct bulk_checkin_source_fd src;
  init_bulk_checkin_source_from_fd(&src, ...);

If even the construction is somewhat virtualized, then you are stuck
with heap constructors like:

  struct bulk_checkin_source *bulk_checkin_source_from_fd(...);

Not too bad, but you have to remember to free now.

Alternatively, I think some of our other OO code just leaves room for
a type-specific void pointer, like:

  struct bulk_checkin_source {
	...the usual stuff...

	void *magic_type_data;
  };

and then the init_bulk_checkin_source_from_fd() function allocates its
own heap struct for the magic_type_data field and sticks the int in
there.

That said, both of those are a lot more annoying to use in C (more
boilerplate, more casting, and more opportunities to get something
wrong, including leaks). So I don't mind this in-between state. It is a
funny layering violation from an OO standpoint, but it's not like we
expect an unbounded set of concrete types to "inherit" from the source
struct.

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-23 22:45   ` [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
@ 2023-10-25  7:58     ` Patrick Steinhardt
  2023-10-25 15:44       ` Taylor Blau
  0 siblings, 1 reply; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-25  7:58 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano

[-- Attachment #1: Type: text/plain, Size: 6245 bytes --]

On Mon, Oct 23, 2023 at 06:45:01PM -0400, Taylor Blau wrote:
> Introduce `index_blob_bulk_checkin_incore()` which allows streaming
> arbitrary blob contents from memory into the bulk-checkin pack.
> 
> In order to support streaming from a location in memory, we must
> implement a new kind of bulk_checkin_source that does just that. These
> implementation in spread out across:

Nit: the commit message is a bit off here. Probably not worth a reroll
though.

>   - init_bulk_checkin_source_incore()
>   - bulk_checkin_source_read_incore()
>   - bulk_checkin_source_seek_incore()
> 
> Note that, unlike file descriptors, which manage their own offset
> internally, we have to keep track of how many bytes we've read out of
> the buffer, and make sure we don't read past the end of the buffer.
> 
> This will be useful in a couple of more commits in order to provide the
> `merge-tree` builtin with a mechanism to create a new pack containing
> any objects it created during the merge, instead of storing those
> objects individually as loose.
> 
> Similar to the existing `index_blob_bulk_checkin()` function, the
> entrypoint delegates to `deflate_obj_to_pack_incore()`. That function in
> turn delegates to deflate_obj_to_pack(), which is responsible for
> formatting the pack header and then deflating the contents into the
> pack.
> 
> Consistent with the rest of the bulk-checkin mechanism, there are no
> direct tests here. In future commits when we expose this new
> functionality via the `merge-tree` builtin, we will test it indirectly
> there.
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  bulk-checkin.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  bulk-checkin.h |  4 +++
>  2 files changed, 79 insertions(+)
> 
> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index 79776e679e..b728210bc7 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -148,6 +148,10 @@ struct bulk_checkin_source {
>  		struct {
>  			int fd;
>  		} from_fd;
> +		struct {
> +			const void *buf;
> +			size_t nr_read;
> +		} incore;
>  	} data;
>  
>  	size_t size;
> @@ -166,6 +170,36 @@ static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source
>  	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
>  }
>  
> +static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
> +					     void *buf, size_t nr)
> +{
> +	const unsigned char *src = source->data.incore.buf;
> +
> +	if (source->data.incore.nr_read > source->size)
> +		BUG("read beyond bulk-checkin source buffer end "
> +		    "(%"PRIuMAX" > %"PRIuMAX")",
> +		    (uintmax_t)source->data.incore.nr_read,
> +		    (uintmax_t)source->size);
> +
> +	if (nr > source->size - source->data.incore.nr_read)
> +		nr = source->size - source->data.incore.nr_read;
> +
> +	src += source->data.incore.nr_read;
> +
> +	memcpy(buf, src, nr);
> +	source->data.incore.nr_read += nr;
> +	return nr;
> +}
> +
> +static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
> +					     off_t offset)
> +{
> +	if (!(0 <= offset && offset < source->size))
> +		return (off_t)-1;

At the risk of showing my own ignorance: why is the cast here
necessary?

Patrick

> +	source->data.incore.nr_read = offset;
> +	return source->data.incore.nr_read;
> +}
> +
>  static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>  					     int fd, size_t size,
>  					     const char *path)
> @@ -181,6 +215,22 @@ static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>  	source->path = path;
>  }
>  
> +static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source,
> +					    const void *buf, size_t size,
> +					    const char *path)
> +{
> +	memset(source, 0, sizeof(struct bulk_checkin_source));
> +
> +	source->read = bulk_checkin_source_read_incore;
> +	source->seek = bulk_checkin_source_seek_incore;
> +
> +	source->data.incore.buf = buf;
> +	source->data.incore.nr_read = 0;
> +
> +	source->size = size;
> +	source->path = path;
> +}
> +
>  /*
>   * Read the contents from 'source' for 'size' bytes, streaming it to the
>   * packfile in state while updating the hash in ctx. Signal a failure
> @@ -359,6 +409,19 @@ static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
>  	return 0;
>  }
>  
> +static int deflate_obj_to_pack_incore(struct bulk_checkin_packfile *state,
> +				       struct object_id *result_oid,
> +				       const void *buf, size_t size,
> +				       const char *path, enum object_type type,
> +				       unsigned flags)
> +{
> +	struct bulk_checkin_source source;
> +
> +	init_bulk_checkin_source_incore(&source, buf, size, path);
> +
> +	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
> +}
> +
>  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>  				struct object_id *result_oid,
>  				int fd, size_t size,
> @@ -421,6 +484,18 @@ int index_blob_bulk_checkin(struct object_id *oid,
>  	return status;
>  }
>  
> +int index_blob_bulk_checkin_incore(struct object_id *oid,
> +				   const void *buf, size_t size,
> +				   const char *path, unsigned flags)
> +{
> +	int status = deflate_obj_to_pack_incore(&bulk_checkin_packfile, oid,
> +						buf, size, path, OBJ_BLOB,
> +						flags);
> +	if (!odb_transaction_nesting)
> +		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
> +	return status;
> +}
> +
>  void begin_odb_transaction(void)
>  {
>  	odb_transaction_nesting += 1;
> diff --git a/bulk-checkin.h b/bulk-checkin.h
> index aa7286a7b3..1b91daeaee 100644
> --- a/bulk-checkin.h
> +++ b/bulk-checkin.h
> @@ -13,6 +13,10 @@ int index_blob_bulk_checkin(struct object_id *oid,
>  			    int fd, size_t size,
>  			    const char *path, unsigned flags);
>  
> +int index_blob_bulk_checkin_incore(struct object_id *oid,
> +				   const void *buf, size_t size,
> +				   const char *path, unsigned flags);
> +
>  /*
>   * Tell the object database to optimize for adding
>   * multiple objects. end_odb_transaction must be called
> -- 
> 2.42.0.425.g963d08ddb3.dirty
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-10-23 22:45   ` [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
@ 2023-10-25  7:58     ` Patrick Steinhardt
  2023-10-25 15:46       ` Taylor Blau
  2023-11-10 23:51     ` Elijah Newren
  1 sibling, 1 reply; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-25  7:58 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano

[-- Attachment #1: Type: text/plain, Size: 11501 bytes --]

On Mon, Oct 23, 2023 at 06:45:06PM -0400, Taylor Blau wrote:
> When using merge-tree often within a repository[^1], it is possible to
> generate a relatively large number of loose objects, which can result in
> degraded performance, and inode exhaustion in extreme cases.
> 
> Building on the functionality introduced in previous commits, the
> bulk-checkin machinery now has support to write arbitrary blob and tree
> objects which are small enough to be held in-core. We can use this to
> write any blob/tree objects generated by ORT into a separate pack
> instead of writing them out individually as loose.
> 
> This functionality is gated behind a new `--write-pack` option to
> `merge-tree` that works with the (non-deprecated) `--write-tree` mode.
> 
> The implementation is relatively straightforward. There are two spots
> within the ORT mechanism where we call `write_object_file()`, one for
> content differences within blobs, and another to assemble any new trees
> necessary to construct the merge. In each of those locations,
> conditionally replace calls to `write_object_file()` with
> `index_blob_bulk_checkin_incore()` or `index_tree_bulk_checkin_incore()`
> depending on which kind of object we are writing.
> 
> The only remaining task is to begin and end the transaction necessary to
> initialize the bulk-checkin machinery, and move any new pack(s) it
> created into the main object store.
> 
> [^1]: Such is the case at GitHub, where we run presumptive "test merges"
>   on open pull requests to see whether or not we can light up the merge
>   button green depending on whether or not the presumptive merge was
>   conflicted.
> 
>   This is done in response to a number of user-initiated events,
>   including viewing an open pull request whose last test merge is stale
>   with respect to the current base and tip of the pull request. As a
>   result, merge-tree can be run very frequently on large, active
>   repositories.
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  Documentation/git-merge-tree.txt |  4 ++
>  builtin/merge-tree.c             |  5 ++
>  merge-ort.c                      | 42 +++++++++++----
>  merge-recursive.h                |  1 +
>  t/t4301-merge-tree-write-tree.sh | 93 ++++++++++++++++++++++++++++++++
>  5 files changed, 136 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/git-merge-tree.txt b/Documentation/git-merge-tree.txt
> index ffc4fbf7e8..9d37609ef1 100644
> --- a/Documentation/git-merge-tree.txt
> +++ b/Documentation/git-merge-tree.txt
> @@ -69,6 +69,10 @@ OPTIONS
>  	specify a merge-base for the merge, and specifying multiple bases is
>  	currently not supported. This option is incompatible with `--stdin`.
>  
> +--write-pack::
> +	Write any new objects into a separate packfile instead of as
> +	individual loose objects.
> +
>  [[OUTPUT]]
>  OUTPUT
>  ------
> diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c
> index a35e0452d6..218442ac9b 100644
> --- a/builtin/merge-tree.c
> +++ b/builtin/merge-tree.c
> @@ -19,6 +19,7 @@
>  #include "tree.h"
>  #include "config.h"
>  #include "strvec.h"
> +#include "bulk-checkin.h"
>  
>  static int line_termination = '\n';
>  
> @@ -416,6 +417,7 @@ struct merge_tree_options {
>  	int name_only;
>  	int use_stdin;
>  	struct merge_options merge_options;
> +	int write_pack;
>  };
>  
>  static int real_merge(struct merge_tree_options *o,
> @@ -441,6 +443,7 @@ static int real_merge(struct merge_tree_options *o,
>  				 _("not something we can merge"));
>  
>  	opt.show_rename_progress = 0;
> +	opt.write_pack = o->write_pack;
>  
>  	opt.branch1 = branch1;
>  	opt.branch2 = branch2;
> @@ -553,6 +556,8 @@ int cmd_merge_tree(int argc, const char **argv, const char *prefix)
>  			   N_("specify a merge-base for the merge")),
>  		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
>  			N_("option for selected merge strategy")),
> +		OPT_BOOL(0, "write-pack", &o.write_pack,
> +			 N_("write new objects to a pack instead of as loose")),
>  		OPT_END()
>  	};
>  
> diff --git a/merge-ort.c b/merge-ort.c
> index 3653725661..523577d71e 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -48,6 +48,7 @@
>  #include "tree.h"
>  #include "unpack-trees.h"
>  #include "xdiff-interface.h"
> +#include "bulk-checkin.h"
>  
>  /*
>   * We have many arrays of size 3.  Whenever we have such an array, the
> @@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt,
>  		if ((merge_status < 0) || !result_buf.ptr)
>  			ret = error(_("failed to execute internal merge"));
>  
> -		if (!ret &&
> -		    write_object_file(result_buf.ptr, result_buf.size,
> -				      OBJ_BLOB, &result->oid))
> -			ret = error(_("unable to add %s to database"), path);
> +		if (!ret) {
> +			ret = opt->write_pack
> +				? index_blob_bulk_checkin_incore(&result->oid,
> +								 result_buf.ptr,
> +								 result_buf.size,
> +								 path, 1)
> +				: write_object_file(result_buf.ptr,
> +						    result_buf.size,
> +						    OBJ_BLOB, &result->oid);
> +			if (ret)
> +				ret = error(_("unable to add %s to database"),
> +					    path);
> +		}
>  
>  		free(result_buf.ptr);
>  		if (ret)
> @@ -3597,7 +3607,8 @@ static int tree_entry_order(const void *a_, const void *b_)
>  				 b->string, strlen(b->string), bmi->result.mode);
>  }
>  
> -static int write_tree(struct object_id *result_oid,
> +static int write_tree(struct merge_options *opt,
> +		      struct object_id *result_oid,
>  		      struct string_list *versions,
>  		      unsigned int offset,
>  		      size_t hash_size)
> @@ -3631,8 +3642,14 @@ static int write_tree(struct object_id *result_oid,
>  	}
>  
>  	/* Write this object file out, and record in result_oid */
> -	if (write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid))
> +	ret = opt->write_pack
> +		? index_tree_bulk_checkin_incore(result_oid,
> +						 buf.buf, buf.len, "", 1)
> +		: write_object_file(buf.buf, buf.len, OBJ_TREE, result_oid);
> +
> +	if (ret)
>  		ret = -1;
> +
>  	strbuf_release(&buf);
>  	return ret;
>  }
> @@ -3797,8 +3814,8 @@ static int write_completed_directory(struct merge_options *opt,
>  		 */
>  		dir_info->is_null = 0;
>  		dir_info->result.mode = S_IFDIR;
> -		if (write_tree(&dir_info->result.oid, &info->versions, offset,
> -			       opt->repo->hash_algo->rawsz) < 0)
> +		if (write_tree(opt, &dir_info->result.oid, &info->versions,
> +			       offset, opt->repo->hash_algo->rawsz) < 0)
>  			ret = -1;
>  	}
>  
> @@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt,
>  		fflush(stdout);
>  		BUG("dir_metadata accounting completely off; shouldn't happen");
>  	}
> -	if (write_tree(result_oid, &dir_metadata.versions, 0,
> +	if (write_tree(opt, result_oid, &dir_metadata.versions, 0,
>  		       opt->repo->hash_algo->rawsz) < 0)
>  		ret = -1;
> +
> +	if (opt->write_pack)
> +		end_odb_transaction();
> +
>  cleanup:
>  	string_list_clear(&plist, 0);
>  	string_list_clear(&dir_metadata.versions, 0);
> @@ -4878,6 +4899,9 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
>  	 */
>  	strmap_init(&opt->priv->conflicts);
>  
> +	if (opt->write_pack)
> +		begin_odb_transaction();
> +
>  	trace2_region_leave("merge", "allocate/init", opt->repo);
>  }
>  
> diff --git a/merge-recursive.h b/merge-recursive.h
> index 3d3b3e3c29..5c5ff380a8 100644
> --- a/merge-recursive.h
> +++ b/merge-recursive.h
> @@ -48,6 +48,7 @@ struct merge_options {
>  	unsigned renormalize : 1;
>  	unsigned record_conflict_msgs_as_headers : 1;
>  	const char *msg_header_prefix;
> +	unsigned write_pack : 1;
>  
>  	/* internal fields used by the implementation */
>  	struct merge_options_internal *priv;
> diff --git a/t/t4301-merge-tree-write-tree.sh b/t/t4301-merge-tree-write-tree.sh
> index b2c8a43fce..d2a8634523 100755
> --- a/t/t4301-merge-tree-write-tree.sh
> +++ b/t/t4301-merge-tree-write-tree.sh
> @@ -945,4 +945,97 @@ test_expect_success 'check the input format when --stdin is passed' '
>  	test_cmp expect actual
>  '
>  
> +packdir=".git/objects/pack"
> +
> +test_expect_success 'merge-tree can pack its result with --write-pack' '
> +	test_when_finished "rm -rf repo" &&
> +	git init repo &&
> +
> +	# base has lines [3, 4, 5]
> +	#   - side adds to the beginning, resulting in [1, 2, 3, 4, 5]
> +	#   - other adds to the end, resulting in [3, 4, 5, 6, 7]
> +	#
> +	# merging the two should result in a new blob object containing
> +	# [1, 2, 3, 4, 5, 6, 7], along with a new tree.
> +	test_commit -C repo base file "$(test_seq 3 5)" &&
> +	git -C repo branch -M main &&
> +	git -C repo checkout -b side main &&
> +	test_commit -C repo side file "$(test_seq 1 5)" &&
> +	git -C repo checkout -b other main &&
> +	test_commit -C repo other file "$(test_seq 3 7)" &&
> +
> +	find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
> +	tree="$(git -C repo merge-tree --write-pack \
> +		refs/tags/side refs/tags/other)" &&
> +	blob="$(git -C repo rev-parse $tree:file)" &&
> +	find repo/$packdir -type f -name "pack-*.idx" >packs.after &&

While we do assert that we write a new packfile, we don't assert that
none of the written objects were also written out as loose objects. Do
we want to tighten the checks to verify that?

Patrick

> +	test_must_be_empty packs.before &&
> +	test_line_count = 1 packs.after &&
> +
> +	git show-index <$(cat packs.after) >objects &&
> +	test_line_count = 2 objects &&
> +	grep "^[1-9][0-9]* $tree" objects &&
> +	grep "^[1-9][0-9]* $blob" objects
> +'
> +
> +test_expect_success 'merge-tree can write multiple packs with --write-pack' '
> +	test_when_finished "rm -rf repo" &&
> +	git init repo &&
> +	(
> +		cd repo &&
> +
> +		git config pack.packSizeLimit 512 &&
> +
> +		test_seq 512 >f &&
> +
> +		# "f" contains roughly 2,000 bytes.
> +		#
> +		# Each side ("foo" and "bar") adds a small amount of data at the
> +		# beginning and end of "base", respectively.
> +		git add f &&
> +		test_tick &&
> +		git commit -m base &&
> +		git branch -M main &&
> +
> +		git checkout -b foo main &&
> +		{
> +			echo foo && cat f
> +		} >f.tmp &&
> +		mv f.tmp f &&
> +		git add f &&
> +		test_tick &&
> +		git commit -m foo &&
> +
> +		git checkout -b bar main &&
> +		echo bar >>f &&
> +		git add f &&
> +		test_tick &&
> +		git commit -m bar &&
> +
> +		find $packdir -type f -name "pack-*.idx" >packs.before &&
> +		# Merging either side should result in a new object which is
> +		# larger than the pack.packSizeLimit set above, thus the
> +		# result should be split into two separate packs.
> +		tree="$(git merge-tree --write-pack \
> +			refs/heads/foo refs/heads/bar)" &&
> +		blob="$(git rev-parse $tree:f)" &&
> +		find $packdir -type f -name "pack-*.idx" >packs.after &&
> +
> +		test_must_be_empty packs.before &&
> +		test_line_count = 2 packs.after &&
> +		for idx in $(cat packs.after)
> +		do
> +			git show-index <$idx || return 1
> +		done >objects &&
> +
> +		# The resulting set of packs should contain one copy of both
> +		# objects, each in a separate pack.
> +		test_line_count = 2 objects &&
> +		grep "^[1-9][0-9]* $tree" objects &&
> +		grep "^[1-9][0-9]* $blob" objects
> +
> +	)
> +'
> +
>  test_done
> -- 
> 2.42.0.425.g963d08ddb3.dirty

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
                     ` (5 preceding siblings ...)
  2023-10-23 23:31   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Junio C Hamano
@ 2023-10-25  7:58   ` Patrick Steinhardt
  6 siblings, 0 replies; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-25  7:58 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano

[-- Attachment #1: Type: text/plain, Size: 18764 bytes --]

On Mon, Oct 23, 2023 at 06:44:49PM -0400, Taylor Blau wrote:
> (Rebased onto the current tip of 'master', which is ceadf0f3cf (The
> twentieth batch, 2023-10-20)).
> 
> This series implements support for a new merge-tree option,
> `--write-pack`, which causes any newly-written objects to be packed
> together instead of being stored individually as loose.
> 
> This is a minor follow-up that could be taken instead of v4 (though the
> changes between these two most recent rounds are stylistic and a matter
> of subjective opinion).
> 
> This moves the bulk_checkin_source structure introduced in response
> to Junio's suggestion during the last round further in the OOP
> direction. Instead of switching on the enum type of the source, it
> now has function pointers for read() and seek(), respectively.
> 
> The functionality at the end is the same, but this avoids some of the
> namespacing issues that Peff pointed out while looking at v4. But this
> approach ended up being less heavy-weight than I had originally
> imagined, so I think that this version is a worthwhile improvement
> over v4.
> 
> Beyond that, the changes since last time can be viewed in the range-diff
> below. Thanks in advance for any review!

Overall this version looks good to me. I've only got two smallish nits
and one question regarding the tests.

Thanks!

Patrick

> [1]: https://lore.kernel.org/git/xmqq34y7plj4.fsf@gitster.g/
> 
> Taylor Blau (5):
>   bulk-checkin: extract abstract `bulk_checkin_source`
>   bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
>   bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>   bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
>   builtin/merge-tree.c: implement support for `--write-pack`
> 
>  Documentation/git-merge-tree.txt |   4 +
>  builtin/merge-tree.c             |   5 +
>  bulk-checkin.c                   | 197 +++++++++++++++++++++++++++----
>  bulk-checkin.h                   |   8 ++
>  merge-ort.c                      |  42 +++++--
>  merge-recursive.h                |   1 +
>  t/t4301-merge-tree-write-tree.sh |  93 +++++++++++++++
>  7 files changed, 316 insertions(+), 34 deletions(-)
> 
> Range-diff against v4:
> 1:  97bb6e9f59 ! 1:  696aa027e4 bulk-checkin: extract abstract `bulk_checkin_source`
>     @@ bulk-checkin.c: static int already_written(struct bulk_checkin_packfile *state,
>       }
>       
>      +struct bulk_checkin_source {
>     -+	enum { SOURCE_FILE } type;
>     ++	off_t (*read)(struct bulk_checkin_source *, void *, size_t);
>     ++	off_t (*seek)(struct bulk_checkin_source *, off_t);
>      +
>     -+	/* SOURCE_FILE fields */
>     -+	int fd;
>     ++	union {
>     ++		struct {
>     ++			int fd;
>     ++		} from_fd;
>     ++	} data;
>      +
>     -+	/* common fields */
>      +	size_t size;
>      +	const char *path;
>      +};
>      +
>     -+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
>     -+					 off_t offset)
>     ++static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
>     ++					      void *buf, size_t nr)
>      +{
>     -+	switch (source->type) {
>     -+	case SOURCE_FILE:
>     -+		return lseek(source->fd, offset, SEEK_SET);
>     -+	default:
>     -+		BUG("unknown bulk-checkin source: %d", source->type);
>     -+	}
>     ++	return read_in_full(source->data.from_fd.fd, buf, nr);
>      +}
>      +
>     -+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
>     -+					void *buf, size_t nr)
>     ++static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
>     ++					      off_t offset)
>      +{
>     -+	switch (source->type) {
>     -+	case SOURCE_FILE:
>     -+		return read_in_full(source->fd, buf, nr);
>     -+	default:
>     -+		BUG("unknown bulk-checkin source: %d", source->type);
>     -+	}
>     ++	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
>     ++}
>     ++
>     ++static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     ++					     int fd, size_t size,
>     ++					     const char *path)
>     ++{
>     ++	memset(source, 0, sizeof(struct bulk_checkin_source));
>     ++
>     ++	source->read = bulk_checkin_source_read_from_fd;
>     ++	source->seek = bulk_checkin_source_seek_from_fd;
>     ++
>     ++	source->data.from_fd.fd = fd;
>     ++
>     ++	source->size = size;
>     ++	source->path = path;
>      +}
>      +
>       /*
>     @@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *sta
>      -			ssize_t read_result = read_in_full(fd, ibuf, rsize);
>      +			ssize_t read_result;
>      +
>     -+			read_result = bulk_checkin_source_read(source, ibuf,
>     -+							       rsize);
>     ++			read_result = source->read(source, ibuf, rsize);
>       			if (read_result < 0)
>      -				die_errno("failed to read from '%s'", path);
>      +				die_errno("failed to read from '%s'",
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       	unsigned header_len;
>       	struct hashfile_checkpoint checkpoint = {0};
>       	struct pack_idx_entry *idx = NULL;
>     -+	struct bulk_checkin_source source = {
>     -+		.type = SOURCE_FILE,
>     -+		.fd = fd,
>     -+		.size = size,
>     -+		.path = path,
>     -+	};
>     ++	struct bulk_checkin_source source;
>     ++
>     ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
>       
>       	seekback = lseek(fd, 0, SEEK_CUR);
>       	if (seekback == (off_t) -1)
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       		state->offset = checkpoint.offset;
>       		flush_bulk_checkin_packfile(state);
>      -		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
>     -+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
>     ++		if (source.seek(&source, seekback) == (off_t)-1)
>       			return error("cannot seek back");
>       	}
>       	the_hash_algo->final_oid_fn(result_oid, &ctx);
> 2:  9d633df339 < -:  ---------- bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
> 3:  d5bbd7810e ! 2:  596bd028a7 bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
>     @@ Metadata
>      Author: Taylor Blau <me@ttaylorr.com>
>      
>       ## Commit message ##
>     -    bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source`
>     +    bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types
>      
>     -    Prepare for a future change where we will want to use a routine very
>     -    similar to the existing `deflate_blob_to_pack()` but over arbitrary
>     -    sources (i.e. either open file-descriptors, or a location in memory).
>     +    The existing `stream_blob_to_pack()` function is named based on the fact
>     +    that it knows only how to stream blobs into a bulk-checkin pack.
>      
>     -    Extract out a common "deflate_obj_to_pack()" routine that acts on a
>     -    bulk_checkin_source, instead of a (int, size_t) pair. Then rewrite
>     -    `deflate_blob_to_pack()` in terms of it.
>     +    But there is no longer anything in this function which prevents us from
>     +    writing objects of arbitrary types to the bulk-checkin pack. Prepare to
>     +    write OBJ_TREEs by removing this assumption, adding an `enum
>     +    object_type` parameter to this function's argument list, and renaming it
>     +    to `stream_obj_to_pack()` accordingly.
>      
>          Signed-off-by: Taylor Blau <me@ttaylorr.com>
>      
>       ## bulk-checkin.c ##
>     +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     +  * status before calling us just in case we ask it to call us again
>     +  * with a new pack.
>     +  */
>     +-static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
>     +-			       git_hash_ctx *ctx, off_t *already_hashed_to,
>     +-			       struct bulk_checkin_source *source,
>     +-			       unsigned flags)
>     ++static int stream_obj_to_pack(struct bulk_checkin_packfile *state,
>     ++			      git_hash_ctx *ctx, off_t *already_hashed_to,
>     ++			      struct bulk_checkin_source *source,
>     ++			      enum object_type type, unsigned flags)
>     + {
>     + 	git_zstream s;
>     + 	unsigned char ibuf[16384];
>     +@@ bulk-checkin.c: static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
>     + 
>     + 	git_deflate_init(&s, pack_compression_level);
>     + 
>     +-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
>     +-					      size);
>     ++	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), type, size);
>     + 	s.next_out = obuf + hdrlen;
>     + 	s.avail_out = sizeof(obuf) - hdrlen;
>     + 
>      @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *state,
>       		die_errno("unable to write pack header");
>       }
>     @@ bulk-checkin.c: static void prepare_to_stream(struct bulk_checkin_packfile *stat
>       	unsigned header_len;
>       	struct hashfile_checkpoint checkpoint = {0};
>       	struct pack_idx_entry *idx = NULL;
>     --	struct bulk_checkin_source source = {
>     --		.type = SOURCE_FILE,
>     --		.fd = fd,
>     --		.size = size,
>     --		.path = path,
>     --	};
>     +-	struct bulk_checkin_source source;
>       
>     +-	init_bulk_checkin_source_from_fd(&source, fd, size, path);
>     +-
>      -	seekback = lseek(fd, 0, SEEK_CUR);
>      -	if (seekback == (off_t) -1)
>      -		return error("cannot find the current offset");
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       		prepare_to_stream(state, flags);
>       		if (idx) {
>      @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
>     + 			idx->offset = state->offset;
>       			crc32_begin(state->f);
>       		}
>     - 		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
>     --					&source, OBJ_BLOB, flags))
>     +-		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
>     +-					 &source, flags))
>     ++		if (!stream_obj_to_pack(state, &ctx, &already_hashed_to,
>      +					source, type, flags))
>       			break;
>       		/*
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>       		hashfile_truncate(state->f, &checkpoint);
>       		state->offset = checkpoint.offset;
>       		flush_bulk_checkin_packfile(state);
>     --		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
>     -+		if (bulk_checkin_source_seek_to(source, seekback) == (off_t)-1)
>     +-		if (source.seek(&source, seekback) == (off_t)-1)
>     ++		if (source->seek(source, seekback) == (off_t)-1)
>       			return error("cannot seek back");
>       	}
>       	the_hash_algo->final_oid_fn(result_oid, &ctx);
>     @@ bulk-checkin.c: static int deflate_blob_to_pack(struct bulk_checkin_packfile *st
>      +				int fd, size_t size,
>      +				const char *path, unsigned flags)
>      +{
>     -+	struct bulk_checkin_source source = {
>     -+		.type = SOURCE_FILE,
>     -+		.fd = fd,
>     -+		.size = size,
>     -+		.path = path,
>     -+	};
>     -+	off_t seekback = lseek(fd, 0, SEEK_CUR);
>     ++	struct bulk_checkin_source source;
>     ++	off_t seekback;
>     ++
>     ++	init_bulk_checkin_source_from_fd(&source, fd, size, path);
>     ++
>     ++	seekback = lseek(fd, 0, SEEK_CUR);
>      +	if (seekback == (off_t) -1)
>      +		return error("cannot find the current offset");
>      +
> 4:  e427fe6ad3 < -:  ---------- bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source`
> 5:  48095afe80 ! 3:  d8cf8e4395 bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>     @@ Metadata
>       ## Commit message ##
>          bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
>      
>     -    Now that we have factored out many of the common routines necessary to
>     -    index a new object into a pack created by the bulk-checkin machinery, we
>     -    can introduce a variant of `index_blob_bulk_checkin()` that acts on
>     -    blobs whose contents we can fit in memory.
>     +    Introduce `index_blob_bulk_checkin_incore()` which allows streaming
>     +    arbitrary blob contents from memory into the bulk-checkin pack.
>     +
>     +    In order to support streaming from a location in memory, we must
>     +    implement a new kind of bulk_checkin_source that does just that. These
>     +    implementation in spread out across:
>     +
>     +      - init_bulk_checkin_source_incore()
>     +      - bulk_checkin_source_read_incore()
>     +      - bulk_checkin_source_seek_incore()
>     +
>     +    Note that, unlike file descriptors, which manage their own offset
>     +    internally, we have to keep track of how many bytes we've read out of
>     +    the buffer, and make sure we don't read past the end of the buffer.
>      
>          This will be useful in a couple of more commits in order to provide the
>          `merge-tree` builtin with a mechanism to create a new pack containing
>     @@ Commit message
>          Signed-off-by: Taylor Blau <me@ttaylorr.com>
>      
>       ## bulk-checkin.c ##
>     +@@ bulk-checkin.c: struct bulk_checkin_source {
>     + 		struct {
>     + 			int fd;
>     + 		} from_fd;
>     ++		struct {
>     ++			const void *buf;
>     ++			size_t nr_read;
>     ++		} incore;
>     + 	} data;
>     + 
>     + 	size_t size;
>     +@@ bulk-checkin.c: static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source
>     + 	return lseek(source->data.from_fd.fd, offset, SEEK_SET);
>     + }
>     + 
>     ++static off_t bulk_checkin_source_read_incore(struct bulk_checkin_source *source,
>     ++					     void *buf, size_t nr)
>     ++{
>     ++	const unsigned char *src = source->data.incore.buf;
>     ++
>     ++	if (source->data.incore.nr_read > source->size)
>     ++		BUG("read beyond bulk-checkin source buffer end "
>     ++		    "(%"PRIuMAX" > %"PRIuMAX")",
>     ++		    (uintmax_t)source->data.incore.nr_read,
>     ++		    (uintmax_t)source->size);
>     ++
>     ++	if (nr > source->size - source->data.incore.nr_read)
>     ++		nr = source->size - source->data.incore.nr_read;
>     ++
>     ++	src += source->data.incore.nr_read;
>     ++
>     ++	memcpy(buf, src, nr);
>     ++	source->data.incore.nr_read += nr;
>     ++	return nr;
>     ++}
>     ++
>     ++static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
>     ++					     off_t offset)
>     ++{
>     ++	if (!(0 <= offset && offset < source->size))
>     ++		return (off_t)-1;
>     ++	source->data.incore.nr_read = offset;
>     ++	return source->data.incore.nr_read;
>     ++}
>     ++
>     + static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     + 					     int fd, size_t size,
>     + 					     const char *path)
>     +@@ bulk-checkin.c: static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
>     + 	source->path = path;
>     + }
>     + 
>     ++static void init_bulk_checkin_source_incore(struct bulk_checkin_source *source,
>     ++					    const void *buf, size_t size,
>     ++					    const char *path)
>     ++{
>     ++	memset(source, 0, sizeof(struct bulk_checkin_source));
>     ++
>     ++	source->read = bulk_checkin_source_read_incore;
>     ++	source->seek = bulk_checkin_source_seek_incore;
>     ++
>     ++	source->data.incore.buf = buf;
>     ++	source->data.incore.nr_read = 0;
>     ++
>     ++	source->size = size;
>     ++	source->path = path;
>     ++}
>     ++
>     + /*
>     +  * Read the contents from 'source' for 'size' bytes, streaming it to the
>     +  * packfile in state while updating the hash in ctx. Signal a failure
>      @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *state,
>       	return 0;
>       }
>     @@ bulk-checkin.c: static int deflate_obj_to_pack(struct bulk_checkin_packfile *sta
>      +				       const char *path, enum object_type type,
>      +				       unsigned flags)
>      +{
>     -+	struct bulk_checkin_source source = {
>     -+		.type = SOURCE_INCORE,
>     -+		.buf = buf,
>     -+		.size = size,
>     -+		.read = 0,
>     -+		.path = path,
>     -+	};
>     ++	struct bulk_checkin_source source;
>     ++
>     ++	init_bulk_checkin_source_incore(&source, buf, size, path);
>      +
>      +	return deflate_obj_to_pack(state, result_oid, &source, type, 0, flags);
>      +}
> 6:  60568f9281 = 4:  2670192802 bulk-checkin: introduce `index_tree_bulk_checkin_incore()`
> 7:  b9be9df122 ! 5:  3595db76a5 builtin/merge-tree.c: implement support for `--write-pack`
>     @@ Documentation/git-merge-tree.txt: OPTIONS
>      
>       ## builtin/merge-tree.c ##
>      @@
>     - #include "quote.h"
>       #include "tree.h"
>       #include "config.h"
>     + #include "strvec.h"
>      +#include "bulk-checkin.h"
>       
>       static int line_termination = '\n';
>       
>      @@ builtin/merge-tree.c: struct merge_tree_options {
>     - 	int show_messages;
>       	int name_only;
>       	int use_stdin;
>     + 	struct merge_options merge_options;
>      +	int write_pack;
>       };
>       
>       static int real_merge(struct merge_tree_options *o,
>      @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
>     - 	init_merge_options(&opt, the_repository);
>     + 				 _("not something we can merge"));
>       
>       	opt.show_rename_progress = 0;
>      +	opt.write_pack = o->write_pack;
>     @@ builtin/merge-tree.c: static int real_merge(struct merge_tree_options *o,
>       	opt.branch1 = branch1;
>       	opt.branch2 = branch2;
>      @@ builtin/merge-tree.c: int cmd_merge_tree(int argc, const char **argv, const char *prefix)
>     - 			   &merge_base,
>     - 			   N_("commit"),
>       			   N_("specify a merge-base for the merge")),
>     + 		OPT_STRVEC('X', "strategy-option", &xopts, N_("option=value"),
>     + 			N_("option for selected merge strategy")),
>      +		OPT_BOOL(0, "write-pack", &o.write_pack,
>      +			 N_("write new objects to a pack instead of as loose")),
>       		OPT_END()
> 
> base-commit: ceadf0f3cf51550166a387ec8508bb55e7883057
> -- 
> 2.42.0.425.g963d08ddb3.dirty

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-25  7:37     ` Jeff King
@ 2023-10-25 15:39       ` Taylor Blau
  2023-10-27 23:12       ` Junio C Hamano
  1 sibling, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-25 15:39 UTC (permalink / raw)
  To: Jeff King
  Cc: git, Elijah Newren, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Wed, Oct 25, 2023 at 03:37:36AM -0400, Jeff King wrote:
> I don't mind this in-between state. It is a funny layering violation
> from an OO standpoint, but it's not like we expect an unbounded set of
> concrete types to "inherit" from the source struct.

Yeah, this was exactly my thinking when writing up the changes for this
round. Since all of the "sub-classes" are local to the bulk-checkin.o
compilation unit, I don't have grave concerns about one implementation
peering into the details of another's.

Gotta stop somewhere ;-).

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-25  7:58     ` Patrick Steinhardt
@ 2023-10-25 15:44       ` Taylor Blau
  2023-10-25 17:21         ` Eric Sunshine
  0 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-10-25 15:44 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano

On Wed, Oct 25, 2023 at 09:58:06AM +0200, Patrick Steinhardt wrote:
> On Mon, Oct 23, 2023 at 06:45:01PM -0400, Taylor Blau wrote:
> > Introduce `index_blob_bulk_checkin_incore()` which allows streaming
> > arbitrary blob contents from memory into the bulk-checkin pack.
> >
> > In order to support streaming from a location in memory, we must
> > implement a new kind of bulk_checkin_source that does just that. These
> > implementation in spread out across:
>
> Nit: the commit message is a bit off here. Probably not worth a reroll
> though.

Your eyes are definitely better than mine, because I'm not seeing where
the commit message is off! But hopefully, since you already don't think
it's worth a reroll and I'm not even sure what the issue is, we can
just leave it ;-).

> > +static off_t bulk_checkin_source_seek_incore(struct bulk_checkin_source *source,
> > +					     off_t offset)
> > +{
> > +	if (!(0 <= offset && offset < source->size))
> > +		return (off_t)-1;
>
> At the risk of showing my own ignorance, but why is the cast here
> necessary?

It's not necessary; see this godbolt example, which shows two functions
doing the same thing (one with the explicit cast, one without):

  https://godbolt.org/z/f737EcGfG

But there are a couple of other spots within the bulk-checkin code
(specifically within the deflate_blob_to_pack() routine) that explicitly
cast -1 to an off_t, so I was more trying to imitate the local style.
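
For what it's worth, here is a minimal standalone sketch showing that
the two spellings end up identical (the return value is converted to
off_t either way):

    #include <sys/types.h>

    off_t seek_err_explicit(void)
    {
            return (off_t)-1; /* explicit cast, matching the local style */
    }

    off_t seek_err_implicit(void)
    {
            return -1; /* implicitly converted to off_t all the same */
    }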

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-10-25  7:58     ` Patrick Steinhardt
@ 2023-10-25 15:46       ` Taylor Blau
  0 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-10-25 15:46 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Elijah Newren, Eric W. Biederman, Jeff King, Junio C Hamano

On Wed, Oct 25, 2023 at 09:58:11AM +0200, Patrick Steinhardt wrote:
> > +test_expect_success 'merge-tree can pack its result with --write-pack' '
> > +	test_when_finished "rm -rf repo" &&
> > +	git init repo &&
> > +
> > +	# base has lines [3, 4, 5]
> > +	#   - side adds to the beginning, resulting in [1, 2, 3, 4, 5]
> > +	#   - other adds to the end, resulting in [3, 4, 5, 6, 7]
> > +	#
> > +	# merging the two should result in a new blob object containing
> > +	# [1, 2, 3, 4, 5, 6, 7], along with a new tree.
> > +	test_commit -C repo base file "$(test_seq 3 5)" &&
> > +	git -C repo branch -M main &&
> > +	git -C repo checkout -b side main &&
> > +	test_commit -C repo side file "$(test_seq 1 5)" &&
> > +	git -C repo checkout -b other main &&
> > +	test_commit -C repo other file "$(test_seq 3 7)" &&
> > +
> > +	find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
> > +	tree="$(git -C repo merge-tree --write-pack \
> > +		refs/tags/side refs/tags/other)" &&
> > +	blob="$(git -C repo rev-parse $tree:file)" &&
> > +	find repo/$packdir -type f -name "pack-*.idx" >packs.after &&
>
> While we do assert that we write a new packfile, we don't assert whether
> parts of the written object may have been written as loose objects. Do
> we want to tighten the checks to verify that?

We could, but the tests in t1050 already verify this, so I'm not sure
that we would be significantly hardening our test coverage here. If you
feel strongly about it, though, I'm happy to change it up.

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-25 15:44       ` Taylor Blau
@ 2023-10-25 17:21         ` Eric Sunshine
  2023-10-26  8:16           ` Patrick Steinhardt
  2023-11-11  0:17           ` Elijah Newren
  0 siblings, 2 replies; 63+ messages in thread
From: Eric Sunshine @ 2023-10-25 17:21 UTC (permalink / raw)
  To: Taylor Blau
  Cc: Patrick Steinhardt, git, Elijah Newren, Eric W. Biederman,
	Jeff King, Junio C Hamano

On Wed, Oct 25, 2023 at 11:44 AM Taylor Blau <me@ttaylorr.com> wrote:
> On Wed, Oct 25, 2023 at 09:58:06AM +0200, Patrick Steinhardt wrote:
> > On Mon, Oct 23, 2023 at 06:45:01PM -0400, Taylor Blau wrote:
> > > In order to support streaming from a location in memory, we must
> > > implement a new kind of bulk_checkin_source that does just that. These
> > > implementation in spread out across:
> >
> > Nit: the commit message is a bit off here. Probably not worth a reroll
> > though.
>
> Your eyes are definitely better than mine, because I'm not seeing where
> the commit message is off! But hopefully, since you already don't think
> it's worth a reroll and I'm not even sure what the issue is, we can
> just leave it ;-).

Perhaps:

    s/implementation in/implementations are/

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-25 17:21         ` Eric Sunshine
@ 2023-10-26  8:16           ` Patrick Steinhardt
  2023-11-11  0:17           ` Elijah Newren
  1 sibling, 0 replies; 63+ messages in thread
From: Patrick Steinhardt @ 2023-10-26  8:16 UTC (permalink / raw)
  To: Eric Sunshine
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Jeff King,
	Junio C Hamano

[-- Attachment #1: Type: text/plain, Size: 1122 bytes --]

On Wed, Oct 25, 2023 at 01:21:50PM -0400, Eric Sunshine wrote:
> On Wed, Oct 25, 2023 at 11:44 AM Taylor Blau <me@ttaylorr.com> wrote:
> > On Wed, Oct 25, 2023 at 09:58:06AM +0200, Patrick Steinhardt wrote:
> > > On Mon, Oct 23, 2023 at 06:45:01PM -0400, Taylor Blau wrote:
> > > > In order to support streaming from a location in memory, we must
> > > > implement a new kind of bulk_checkin_source that does just that. These
> > > > implementation in spread out across:
> > >
> > > Nit: the commit message is a bit off here. Probably not worth a reroll
> > > though.
> >
> > Your eyes are definitely better than mine, because I'm not seeing where
> > the commit message is off! But hopefully, since you already don't think
> > it's worth a reroll and I'm not even sure what the issue is, we can
> > just leave it ;-).
> 
> Perhaps:
> 
>     s/implementation in/implementations are/

Yeah, that's what I referred to. Sorry for not pointing it out
explicitly :)

In any case, as stated before: I don't think any of my comments warrant
a reroll, and overall this patch series looks good to me.

Patrick

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source`
  2023-10-25  7:37     ` Jeff King
  2023-10-25 15:39       ` Taylor Blau
@ 2023-10-27 23:12       ` Junio C Hamano
  1 sibling, 0 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-10-27 23:12 UTC (permalink / raw)
  To: Jeff King
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Patrick Steinhardt

Jeff King <peff@peff.net> writes:

> Alternatively, I think some of our other OO code just leaves room for
> a type-specific void pointer, like:
>
>   struct bulk_checkin_source {
> 	...the usual stuff...
>
> 	void *magic_type_data;
>   };
>
> and then the init_bulk_checkin_source_from_fd() function allocates its
> own heap struct for the magic_type_data field and sticks the int in
> there.

Yup.  All the pros-and-cons make sense.  I earlier said I found
this a good place to stop, exactly because the full OO with 

    struct specific_subclass {
	struct vtbl *vtbl;
	struct base_class common_data_specific_to_instance;
	... other instance specific data members here ...;
    }

would require us to add too many supporting infrastructure struct
types; with only a few subclasses in use, it is a bit too much to
justify.
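
Just to illustrate, Peff's void-pointer variant might look roughly
like this (a hypothetical sketch with made-up names, not what the
series does):

    struct bulk_checkin_source {
            off_t size;
            const char *path;
            void *type_data; /* owned by the concrete "subclass" */
    };

    struct incore_source_data {
            const void *buf;
            off_t read_offset;
    };

    static void init_source_incore(struct bulk_checkin_source *s,
                                   const void *buf, size_t size)
    {
            struct incore_source_data *d = xmalloc(sizeof(*d));

            d->buf = buf;
            d->read_offset = 0;
            s->size = size;
            s->type_data = d; /* to be freed when the source is released */
    }

It trades the type-punning for a heap allocation and an extra level
of indirection, which is the usual price of that pattern.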

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-10-23 23:31   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Junio C Hamano
@ 2023-11-06 15:46     ` Johannes Schindelin
  2023-11-06 23:19       ` Junio C Hamano
                         ` (2 more replies)
  0 siblings, 3 replies; 63+ messages in thread
From: Johannes Schindelin @ 2023-11-06 15:46 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Jeff King,
	Patrick Steinhardt

Hi,

On Mon, 23 Oct 2023, Junio C Hamano wrote:

> Taylor Blau <me@ttaylorr.com> writes:
>
> > But I think that this approach ended up being less heavy-weight than I
> > had originally imagined, so I think that this version is a worthwhile
> > improvement over v4.
>
> ;-).
>
> This version is a good place to stop, a bit short of going full OO.
> Nicely done.

I wonder whether a more generic approach would be more desirable, an
approach that would work for `git replay`, too, for example (where
streaming objects does not work because they need to be made available
immediately because subsequent `merge_incore_nonrecursive()` might expect
the created objects to be present)?

What I have in mind is more along Elijah's suggestion at the Contributor
Summit to use the `tmp_objdir*()` machinery. But instead of discarding the
temporary object database, the contained objects would be repacked and the
`.pack`, (maybe `.rev`) and the `.idx` file would then be moved (in that
order) before discarding the temporary object database.

This would probably need to be implemented as a new
`tmp_objdir_pack_and_migrate()` function that basically spawns
`pack-objects` and feeds it the list of generated objects, writing
directly into the non-temporary object directory, then discarding the
`tmp_objdir`.

This approach would not only more easily extend to other commands, but it
would also be less intrusive (a `tmp_objdir_*()` call to begin the
transaction before the objects are written, another one to commit the
transaction after the objects are written), and it would potentially allow
for more efficient packs to be generated.
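
In (very) rough code, ignoring error handling, shortening the pack
base name for illustration, and eliding the plumbing that would feed
the generated object names to pack-objects' stdin:

    static int tmp_objdir_pack_and_migrate(struct tmp_objdir *t)
    {
            struct child_process cmd = CHILD_PROCESS_INIT;

            cmd.git_cmd = 1;
            /* write the resulting pack directly into the main store */
            strvec_pushl(&cmd.args, "pack-objects", "--quiet",
                         "objects/pack/pack", NULL);
            strvec_pushv(&cmd.env, tmp_objdir_env(t));
            if (run_command(&cmd))
                    return -1;

            /* nothing worth keeping remains; discard, don't migrate */
            return tmp_objdir_destroy(t);
    }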

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-11-06 15:46     ` Johannes Schindelin
@ 2023-11-06 23:19       ` Junio C Hamano
  2023-11-07  3:42       ` Jeff King
  2023-11-07 15:58       ` Taylor Blau
  2 siblings, 0 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-11-06 23:19 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Taylor Blau, git, Elijah Newren, Eric W. Biederman, Jeff King,
	Patrick Steinhardt

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> What I have in mind is more along Elijah's suggestion at the Contributor
> Summit to use the `tmp_objdir*()` machinery. But instead of discarding the
> temporary object database, the contained objects would be repacked and the
> `.pack`, (maybe `.rev`) and the `.idx` file would then be moved (in that
> order) before discarding the temporary object database.

That may be more involved but does indeed sound like an approach
more generally applicable.  Back when the bulk-checkin machinery was
invented, I envisioned that we would be adding annotations to
various codepaths so that object creation machinery can say "now we
are plugged, in anticipation for creating many objects at once" and
"now the flood of new object creation is done, time to wrap up" for
that kind of optimization.  

The callsites to {begin,end}_odb_transaction() functions haven't
grown beyond the original "add" and "update-index" (because the user
can add the entire working tree worth of files to the object
database), "unpack-objects" (because a fetch can bring in many
objects), and "cache-tree" (because a tree creation can cascade up
to create many objects), but I agree "merge" and "replay" are prime
candidates to benefit from the optimization of the same kind (so is
"fast-import").  They are about creating many objects at once, and
give us an opportunity for such an optimization.
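
In code terms the annotation pattern is as simple as this (a sketch;
write_one_object() and nr are stand-ins, not real names):

    size_t i;

    begin_odb_transaction();        /* "we are plugged; many objects coming" */
    for (i = 0; i < nr; i++)
            write_one_object(i);    /* writes may be batched, not loose */
    end_odb_transaction();          /* "flood is done"; flush what is pending */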

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-11-06 15:46     ` Johannes Schindelin
  2023-11-06 23:19       ` Junio C Hamano
@ 2023-11-07  3:42       ` Jeff King
  2023-11-07 15:58       ` Taylor Blau
  2 siblings, 0 replies; 63+ messages in thread
From: Jeff King @ 2023-11-07  3:42 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Junio C Hamano, Taylor Blau, git, Elijah Newren,
	Eric W. Biederman, Patrick Steinhardt

On Mon, Nov 06, 2023 at 04:46:32PM +0100, Johannes Schindelin wrote:

> I wonder whether a more generic approach would be more desirable, an
> approach that would work for `git replay`, too, for example (where
> streaming objects does not work because they need to be made available
> immediately because subsequent `merge_incore_nonrecursive()` might expect
> the created objects to be present)?
> 
> What I have in mind is more along Elijah's suggestion at the Contributor
> Summit to use the `tmp_objdir*()` machinery. But instead of discarding the
> temporary object database, the contained objects would be repacked and the
> `.pack`, (maybe `.rev`) and the `.idx` file would then be moved (in that
> order) before discarding the temporary object database.
> 
> This would probably need to be implemented as a new
> `tmp_objdir_pack_and_migrate()` function that basically spawns
> `pack-objects` and feeds it the list of generated objects, writing
> directly into the non-temporary object directory, then discarding the
> `tmp_objdir`.

Perhaps I'm missing some piece of the puzzle, but I'm not sure what
you're trying to accomplish with that approach.

If the goal is to increase performance by avoiding the loose object
writes, then we haven't really helped much. We're still writing them,
and then writing them again for the repack.

If the goal is just to end up with a single nice pack for the long term,
then why do we need to use tmp_objdir at all? The point of that API is
to avoid letting other simultaneous processes see the intermediate state
before we're committed to keeping the objects around. That makes sense
for receiving a fetch or push, since we want to do some quality checks
on the objects before agreeing to keep them. But does it make sense for
a merge? Sure, in some workflows (like GitHub's test merges) we might
end up throwing away the merge result if it's not clean. But there is no
real downside to other processes seeing those objects. They can be
cleaned up at the next pruning repack.

I guess if your scenario requirements include "and we are never allowed
to run a pruning repack", then that could make sense. And I know that
has been a historical issue for GitHub. But I'm not sure it's
necessarily a good driver for an upstream feature.

As an alternative, though, I wonder if you need to have access to the
objects outside of the merge process. If not, then rather than an
alternate object store, what if that single process wrote to a streaming
pack _and_ used its running in-core index of the objects to allow access
via the usual object-retrieval machinery? Then you'd get a single, clean pack as
the outcome _and_ you'd get the performance boost over just "write loose
objects, repack, and prune".
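
To make that a bit more concrete, the shape I have in mind is
something like (totally hypothetical sketch):

    struct streamed_pack_entry {
            struct oidmap_entry entry;      /* key: the object's oid */
            off_t offset;                   /* where it starts in the pack */
    };

    struct streamed_pack {
            struct hashfile *out;   /* pack data being appended to */
            struct oidmap index;    /* oid -> struct streamed_pack_entry */
    };

i.e., each object appended to the pack is also registered in the
in-core index, and the object-reading code consults that index before
falling back to the on-disk object store.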

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 0/5] merge-ort: implement support for packing objects together
  2023-11-06 15:46     ` Johannes Schindelin
  2023-11-06 23:19       ` Junio C Hamano
  2023-11-07  3:42       ` Jeff King
@ 2023-11-07 15:58       ` Taylor Blau
  2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
  2 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-11-07 15:58 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Junio C Hamano, git, Elijah Newren, Eric W. Biederman, Jeff King,
	Patrick Steinhardt

On Mon, Nov 06, 2023 at 04:46:32PM +0100, Johannes Schindelin wrote:
> Hi,
>
> On Mon, 23 Oct 2023, Junio C Hamano wrote:
>
> > Taylor Blau <me@ttaylorr.com> writes:
> >
> > > But I think that this approach ended up being less heavy-weight than I
> > > had originally imagined, so I think that this version is a worthwhile
> > > improvement over v4.
> >
> > ;-).
> >
> > This version is a good place to stop, a bit short of going full OO.
> > Nicely done.
>
> I wonder whether a more generic approach would be more desirable, an
> approach that would work for `git replay`, too, for example (where
> streaming objects does not work because they need to be made available
> immediately because subsequent `merge_incore_nonrecursive()` might expect
> the created objects to be present)?

The goal of this series is to bound the number of inodes consumed by a
single merge-tree invocation down from arbitrarily many (in the case of
storing each new object loose) to a constant (by storing everything in a
single pack).

I'd think that we would want a similar approach for 'replay', but as
you note we have some additional requirements, too:

  - each replayed commit is computed in a single step, which will result
    in a new pack
  - we must be able to see objects from previous steps

I think one feasible approach here for replay is to combine the two
ideas and have a separate objdir that stores N packs (one for each step
of the replay), but then repacks them down into a single pack before
migrating back to the main object store.

That would ensure that we have some isolation between replay-created
objects and the rest of the repository in the intermediate state. Even
though we'd have as many packs as there are commits, we'd consume far
fewer inodes in the process, since each commit can introduce arbitrarily
many new objects, each requiring at least a single inode (potentially
more with sharding).

We'd have to be mindful of having a large number of packs, but I think
that this should mostly be a non-issue, since we'd only be living with N
packs for the lifetime of the replay command (before repacking them down
to a single pack and migrating them back to the main object store).
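
Roughly, the control flow I have in mind (an untested sketch;
next_commit_to_replay() and replay_one_commit() are stand-ins, and
tmp_objdir_repack() would be a new helper):

    struct tmp_objdir *t = tmp_objdir_create("replay");
    struct commit *commit;

    tmp_objdir_replace_primary_odb(t, 0);
    begin_odb_transaction();
    while ((commit = next_commit_to_replay())) {
            replay_one_commit(commit);
            /* make this step's objects visible to the next step */
            flush_odb_transaction();
    }
    end_odb_transaction();

    tmp_objdir_repack(t);   /* combine the N packs into one */
    tmp_objdir_migrate(t);  /* move the single pack into place */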

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [RFC PATCH 0/3] replay: implement support for writing new objects to a pack
  2023-11-07 15:58       ` Taylor Blau
@ 2023-11-07 18:22         ` Taylor Blau
  2023-11-07 18:22           ` [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step Taylor Blau
                             ` (4 more replies)
  0 siblings, 5 replies; 63+ messages in thread
From: Taylor Blau @ 2023-11-07 18:22 UTC (permalink / raw)
  To: git
  Cc: Jeff King, Patrick Steinhardt, Elijah Newren, Junio C Hamano,
	Johannes Schindelin

(Based on a combination of Christian's cc/git-replay and my
tb/merge-tree-write-pack branches).

This RFC demonstrates extending the new `--write-pack` option that
`merge-tree` recently learned to the `replay` builtin as well.

The approach is as follows:

  - write a pack out after each step in the replay operation, so that
    subsequent steps may see any new object(s) created during previous
    steps

  - combine those packs into one before migrating them back into the
    main object store

This is accomplished with a combination of the bulk-checkin and
tmp-objdir APIs, with some minor modifications made to when we flush out
and finalize bulk-checkin transactions.

The benefit to this approach is that we bound the number of inodes
required per replayed commit to a constant (by default, 3: one for the
.pack, one for the .idx, and another for the .rev file), instead of
having each operation take an unbounded number of inodes proportional to
the number of new objects created during that step. We also only migrate
a single pack back to the main object store.

In other words, this makes the maximum number of inodes required by
'replay' grow proportional to the number of commits being replayed,
instead of the number of new *objects* created as a result of the replay
operation.

Taylor Blau (3):
  merge-ort.c: finalize ODB transactions after each step
  tmp-objdir: introduce `tmp_objdir_repack()`
  builtin/replay.c: introduce `--write-pack`

 Documentation/git-replay.txt |  4 ++++
 builtin/replay.c             | 18 ++++++++++++++++++
 merge-ort.c                  |  5 ++++-
 t/t3650-replay-basics.sh     | 37 ++++++++++++++++++++++++++++++++++++
 tmp-objdir.c                 | 13 +++++++++++++
 tmp-objdir.h                 |  6 ++++++
 6 files changed, 82 insertions(+), 1 deletion(-)

-- 
2.42.0.446.g0b9ef90488

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step
  2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
@ 2023-11-07 18:22           ` Taylor Blau
  2023-11-11  3:45             ` Elijah Newren
  2023-11-07 18:22           ` [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()` Taylor Blau
                             ` (3 subsequent siblings)
  4 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-11-07 18:22 UTC (permalink / raw)
  To: git
  Cc: Jeff King, Patrick Steinhardt, Elijah Newren, Junio C Hamano,
	Johannes Schindelin

In a previous commit, the ORT merge backend learned how to use the
bulk-checkin mechanism to emit a single pack containing any new objects
created during the merge. This functionality was implemented by setting
up a new ODB transaction, and finalizing it at the end of the merge via
`process_entries()`.

In a future commit, we will extend this functionality to the new `git
replay` command, which needs to see objects from previous steps in order
to replay each commit.

As a step towards implementing this, teach the ORT backend to flush the
ODB transaction at the end of each step in `process_entries()`, and then
finalize the result with `end_odb_transaction()` when calling
`merge_finalize()`.

For normal `merge-tree --write-pack` invocations, this produces no
functional change: the pack is written out at the end of
`process_entries()`, and then the `end_odb_transaction()` call is a
noop.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 merge-ort.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/merge-ort.c b/merge-ort.c
index 523577d71e..7b352451cc 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -4354,7 +4354,7 @@ static int process_entries(struct merge_options *opt,
 		ret = -1;
 
 	if (opt->write_pack)
-		end_odb_transaction();
+		flush_odb_transaction();
 
 cleanup:
 	string_list_clear(&plist, 0);
@@ -4726,6 +4726,9 @@ void merge_switch_to_result(struct merge_options *opt,
 void merge_finalize(struct merge_options *opt,
 		    struct merge_result *result)
 {
+	if (opt->write_pack)
+		end_odb_transaction();
+
 	if (opt->renormalize)
 		git_attr_set_direction(GIT_ATTR_CHECKIN);
 	assert(opt->priv == NULL);
-- 
2.42.0.446.g0b9ef90488


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()`
  2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
  2023-11-07 18:22           ` [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step Taylor Blau
@ 2023-11-07 18:22           ` Taylor Blau
  2023-11-08  7:05             ` Patrick Steinhardt
  2023-11-07 18:23           ` [RFC PATCH 3/3] builtin/replay.c: introduce `--write-pack` Taylor Blau
                             ` (2 subsequent siblings)
  4 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-11-07 18:22 UTC (permalink / raw)
  To: git
  Cc: Jeff King, Patrick Steinhardt, Elijah Newren, Junio C Hamano,
	Johannes Schindelin

In the following commit, we will teach `git replay` how to write a pack
containing the set of new objects created as a result of the `replay`
operation.

Since `replay` needs to be able to see the object(s) written
from previous steps in order to replay each commit, the ODB transaction
may have multiple pending packs. Migrating multiple packs back into the
main object store has a couple of downsides:

  - It is error-prone to do so: each pack must be migrated in the
    correct order (with the ".idx" file staged last), and the set of
    packs themselves must be moved over in the correct order to avoid
    racy behavior.

  - It is a (potentially significant) performance degradation to migrate
    a large number of packs back into the main object store.

Introduce a new function that combines the set of all packs in the
temporary object store to produce a single pack which is the logical
concatenation of all packs created during that level of the ODB
transaction.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 tmp-objdir.c | 13 +++++++++++++
 tmp-objdir.h |  6 ++++++
 2 files changed, 19 insertions(+)

diff --git a/tmp-objdir.c b/tmp-objdir.c
index 5f9074ad1c..ef53180b47 100644
--- a/tmp-objdir.c
+++ b/tmp-objdir.c
@@ -12,6 +12,7 @@
 #include "strvec.h"
 #include "quote.h"
 #include "object-store-ll.h"
+#include "run-command.h"
 
 struct tmp_objdir {
 	struct strbuf path;
@@ -277,6 +278,18 @@ int tmp_objdir_migrate(struct tmp_objdir *t)
 	return ret;
 }
 
+int tmp_objdir_repack(struct tmp_objdir *t)
+{
+	struct child_process cmd = CHILD_PROCESS_INIT;
+
+	cmd.git_cmd = 1;
+
+	strvec_pushl(&cmd.args, "repack", "-a", "-d", "-k", "-l", NULL);
+	strvec_pushv(&cmd.env, tmp_objdir_env(t));
+
+	return run_command(&cmd);
+}
+
 const char **tmp_objdir_env(const struct tmp_objdir *t)
 {
 	if (!t)
diff --git a/tmp-objdir.h b/tmp-objdir.h
index 237d96b660..d00e3b3e27 100644
--- a/tmp-objdir.h
+++ b/tmp-objdir.h
@@ -36,6 +36,12 @@ struct tmp_objdir *tmp_objdir_create(const char *prefix);
  */
 const char **tmp_objdir_env(const struct tmp_objdir *);
 
+/*
+ * Combines all packs in the tmp_objdir into a single pack before migrating.
+ * Removes original pack(s) after installing the combined pack into place.
+ */
+int tmp_objdir_repack(struct tmp_objdir *);
+
 /*
  * Finalize a temporary object directory by migrating its objects into the main
  * object database, removing the temporary directory, and freeing any
-- 
2.42.0.446.g0b9ef90488


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [RFC PATCH 3/3] builtin/replay.c: introduce `--write-pack`
  2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
  2023-11-07 18:22           ` [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step Taylor Blau
  2023-11-07 18:22           ` [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()` Taylor Blau
@ 2023-11-07 18:23           ` Taylor Blau
  2023-11-11  3:42           ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Elijah Newren
  2023-11-11  4:04           ` Elijah Newren
  4 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-11-07 18:23 UTC (permalink / raw)
  To: git
  Cc: Jeff King, Patrick Steinhardt, Elijah Newren, Junio C Hamano,
	Johannes Schindelin

Now that the prerequisites are in place, we can implement a
`--write-pack` option for `git replay`, corresponding to the existing
one in `git merge-tree`.

The changes are mostly limited to:

  - introducing a new option in the builtin
  - replacing the main object store with the temporary one
  - then repacking and migrating the temporary object store back into
    the main object store after the replay has completed

Along with tests and documentation to ensure that the new behavior
matches our expectations.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-replay.txt |  4 ++++
 builtin/replay.c             | 18 ++++++++++++++++++
 t/t3650-replay-basics.sh     | 37 ++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)

diff --git a/Documentation/git-replay.txt b/Documentation/git-replay.txt
index e7551aec54..f424c1a676 100644
--- a/Documentation/git-replay.txt
+++ b/Documentation/git-replay.txt
@@ -42,6 +42,10 @@ When `--advance` is specified, the update-ref command(s) in the output
 will update the branch passed as an argument to `--advance` to point at
 the new commits (in other words, this mimics a cherry-pick operation).
 
+--write-pack::
+	Write any new objects into a separate packfile instead of as
+	individual loose objects.
+
 <revision-range>::
 	Range of commits to replay. More than one <revision-range> can
 	be passed, but in `--advance <branch>` mode, they should have
diff --git a/builtin/replay.c b/builtin/replay.c
index c3d53ff0cd..72b7b7f43a 100644
--- a/builtin/replay.c
+++ b/builtin/replay.c
@@ -17,6 +17,7 @@
 #include "strmap.h"
 #include <oidset.h>
 #include <tree.h>
+#include "tmp-objdir.h"
 
 static const char *short_commit_name(struct commit *commit)
 {
@@ -272,6 +273,7 @@ int cmd_replay(int argc, const char **argv, const char *prefix)
 	struct commit *onto = NULL;
 	const char *onto_name = NULL;
 	int contained = 0;
+	int write_pack = 0;
 
 	struct rev_info revs;
 	struct commit *last_commit = NULL;
@@ -279,6 +281,7 @@ int cmd_replay(int argc, const char **argv, const char *prefix)
 	struct merge_options merge_opt;
 	struct merge_result result;
 	struct strset *update_refs = NULL;
+	struct tmp_objdir *tmp_objdir = NULL;
 	kh_oid_map_t *replayed_commits;
 	int i, ret = 0;
 
@@ -296,6 +299,8 @@ int cmd_replay(int argc, const char **argv, const char *prefix)
 			   N_("replay onto given commit")),
 		OPT_BOOL(0, "contained", &contained,
 			 N_("advance all branches contained in revision-range")),
+		OPT_BOOL(0, "write-pack", &write_pack,
+			 N_("write new objects to a pack instead of as loose")),
 		OPT_END()
 	};
 
@@ -352,8 +357,15 @@ int cmd_replay(int argc, const char **argv, const char *prefix)
 	init_merge_options(&merge_opt, the_repository);
 	memset(&result, 0, sizeof(result));
 	merge_opt.show_rename_progress = 0;
+	merge_opt.write_pack = write_pack;
 	last_commit = onto;
 	replayed_commits = kh_init_oid_map();
+
+	if (merge_opt.write_pack) {
+		tmp_objdir = tmp_objdir_create("replay");
+		tmp_objdir_replace_primary_odb(tmp_objdir, 0);
+	}
+
 	while ((commit = get_revision(&revs))) {
 		const struct name_decoration *decoration;
 		khint_t pos;
@@ -417,5 +429,11 @@ int cmd_replay(int argc, const char **argv, const char *prefix)
 	/* Return */
 	if (ret < 0)
 		exit(128);
+	if (ret && tmp_objdir) {
+		if (tmp_objdir_repack(tmp_objdir) < 0)
+			ret = 0;
+		else if (tmp_objdir_migrate(tmp_objdir) < 0)
+			ret = 0;
+	}
 	return ret ? 0 : 1;
 }
diff --git a/t/t3650-replay-basics.sh b/t/t3650-replay-basics.sh
index 389670262e..e7048748c2 100755
--- a/t/t3650-replay-basics.sh
+++ b/t/t3650-replay-basics.sh
@@ -67,6 +67,43 @@ test_expect_success 'using replay to rebase two branches, one on top of other' '
 	test_cmp expect result
 '
 
+packdir=.git/objects/pack
+
+test_expect_success 'using replay to rebase two branches, with --write-pack' '
+	# Remove the results of the previous rebase, ensuring that they
+	# are pruned from the object store.
+	git gc --prune=now &&
+	test_must_fail git cat-file -t "$(cut -d " " -f 3 expect)" &&
+
+	# Create an extra packfile to ensure that the tmp-objdir repack
+	# takes place outside of the main object store.
+	git checkout --detach &&
+	test_commit unreachable &&
+	git repack -d &&
+	git checkout main &&
+
+	find $packdir -type f -name "*.idx" | sort >packs.before &&
+	git replay --write-pack --onto main topic1..topic2 >result &&
+	find $packdir -type f -name "*.idx" | sort >packs.after &&
+
+	comm -13 packs.before packs.after >packs.new &&
+
+	# Ensure that we got a single new pack.
+	test_line_count = 1 result &&
+	test_line_count = 1 packs.new &&
+
+	# ... and that the rest of the results match our expectations.
+	git log --format=%s $(cut -f 3 -d " " result) >actual &&
+	test_write_lines E D M L B A >expect &&
+	test_cmp expect actual &&
+
+	printf "update refs/heads/topic2 " >expect &&
+	printf "%s " $(cut -f 3 -d " " result) >>expect &&
+	git rev-parse topic2 >>expect &&
+
+	test_cmp expect result
+'
+
 test_expect_success 'using replay on bare repo to rebase two branches, one on top of other' '
 	git -C bare replay --onto main topic1..topic2 >result-bare &&
 	test_cmp expect result-bare
-- 
2.42.0.446.g0b9ef90488

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()`
  2023-11-07 18:22           ` [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()` Taylor Blau
@ 2023-11-08  7:05             ` Patrick Steinhardt
  2023-11-09 19:26               ` Taylor Blau
  0 siblings, 1 reply; 63+ messages in thread
From: Patrick Steinhardt @ 2023-11-08  7:05 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Jeff King, Elijah Newren, Junio C Hamano, Johannes Schindelin

[-- Attachment #1: Type: text/plain, Size: 3155 bytes --]

On Tue, Nov 07, 2023 at 01:22:58PM -0500, Taylor Blau wrote:
> In the following commit, we will teach `git replay` how to write a pack
> containing the set of new objects created as a result of the `replay`
> operation.
> 
> Since `replay` needs to be able to see the object(s) written
> from previous steps in order to replay each commit, the ODB transaction
> may have multiple pending packs. Migrating multiple packs back into the
> main object store has a couple of downsides:
> 
>   - It is error-prone to do so: each pack must be migrated in the
>     correct order (with the ".idx" file staged last), and the set of
>     packs themselves must be moved over in the correct order to avoid
>     racy behavior.
> 
>   - It is a (potentially significant) performance degradation to migrate
>     a large number of packs back into the main object store.
> 
> Introduce a new function that combines the set of all packs in the
> temporary object store to produce a single pack which is the logical
> concatenation of all packs created during that level of the ODB
> transaction.
> 
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  tmp-objdir.c | 13 +++++++++++++
>  tmp-objdir.h |  6 ++++++
>  2 files changed, 19 insertions(+)
> 
> diff --git a/tmp-objdir.c b/tmp-objdir.c
> index 5f9074ad1c..ef53180b47 100644
> --- a/tmp-objdir.c
> +++ b/tmp-objdir.c
> @@ -12,6 +12,7 @@
>  #include "strvec.h"
>  #include "quote.h"
>  #include "object-store-ll.h"
> +#include "run-command.h"
>  
>  struct tmp_objdir {
>  	struct strbuf path;
> @@ -277,6 +278,18 @@ int tmp_objdir_migrate(struct tmp_objdir *t)
>  	return ret;
>  }
>  
> +int tmp_objdir_repack(struct tmp_objdir *t)
> +{
> +	struct child_process cmd = CHILD_PROCESS_INIT;
> +
> +	cmd.git_cmd = 1;
> +
> +	strvec_pushl(&cmd.args, "repack", "-a", "-d", "-k", "-l", NULL);
> +	strvec_pushv(&cmd.env, tmp_objdir_env(t));

I wonder what performance of this repack would be like in a large
repository with many refs. Ideally, I would expect that the repacking
performance should scale with the number of objects we have written into
the temporary object directory. But in practice, the repack will need to
compute reachability and thus also scales with the size of the repo
itself, doesn't it?

Patrick

> +	return run_command(&cmd);
> +}
> +
>  const char **tmp_objdir_env(const struct tmp_objdir *t)
>  {
>  	if (!t)
> diff --git a/tmp-objdir.h b/tmp-objdir.h
> index 237d96b660..d00e3b3e27 100644
> --- a/tmp-objdir.h
> +++ b/tmp-objdir.h
> @@ -36,6 +36,12 @@ struct tmp_objdir *tmp_objdir_create(const char *prefix);
>   */
>  const char **tmp_objdir_env(const struct tmp_objdir *);
>  
> +/*
> + * Combines all packs in the tmp_objdir into a single pack before migrating.
> + * Removes original pack(s) after installing the combined pack into place.
> + */
> +int tmp_objdir_repack(struct tmp_objdir *);
> +
>  /*
>   * Finalize a temporary object directory by migrating its objects into the main
>   * object database, removing the temporary directory, and freeing any
> -- 
> 2.42.0.446.g0b9ef90488
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()`
  2023-11-08  7:05             ` Patrick Steinhardt
@ 2023-11-09 19:26               ` Taylor Blau
  0 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-11-09 19:26 UTC (permalink / raw)
  To: Patrick Steinhardt
  Cc: git, Jeff King, Elijah Newren, Junio C Hamano, Johannes Schindelin

On Wed, Nov 08, 2023 at 08:05:46AM +0100, Patrick Steinhardt wrote:
> > @@ -277,6 +278,18 @@ int tmp_objdir_migrate(struct tmp_objdir *t)
> >  	return ret;
> >  }
> >
> > +int tmp_objdir_repack(struct tmp_objdir *t)
> > +{
> > +	struct child_process cmd = CHILD_PROCESS_INIT;
> > +
> > +	cmd.git_cmd = 1;
> > +
> > +	strvec_pushl(&cmd.args, "repack", "-a", "-d", "-k", "-l", NULL);
> > +	strvec_pushv(&cmd.env, tmp_objdir_env(t));
>
> I wonder what performance of this repack would be like in a large
> repository with many refs. Ideally, I would expect that the repacking
> performance should scale with the number of objects we have written into
> the temporary object directory. But in practice, the repack will need to
> compute reachability and thus also scales with the size of the repo
> itself, doesn't it?

Good question. We definitely do not want to be doing an all-into-one
repack as a consequence of running 'git replay' in a large repository
with lots of refs, objects, or both.

But since we push the result of calling `tmp_objdir_env(t)` into the
environment of the child process, we are only repacking the objects in
the temporary directory, not the main object store.

I have a test that verifies this is the case by making sure that, in a
repository with some arbitrary set of pre-existing packs, only one
pack is added to that set after running 'replay', and that the
pre-existing packs remain in place.
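
To spell out the mechanism (based on my reading of tmp-objdir.c), the
child repack roughly sees:

    /*
     * tmp_objdir_env() points the child at the temporary directory as
     * its primary object store, with the main store demoted to an
     * alternate:
     *
     *   GIT_OBJECT_DIRECTORY=<temporary object directory>
     *   GIT_ALTERNATE_OBJECT_DIRECTORIES=<main object store>
     *
     * so "repack -a -d -l" packs only the "local" (i.e. temporary)
     * objects and leaves the main store's packs untouched.
     */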

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-10-23 22:45   ` [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
  2023-10-25  7:58     ` Patrick Steinhardt
@ 2023-11-10 23:51     ` Elijah Newren
  2023-11-11  0:27       ` Junio C Hamano
                         ` (2 more replies)
  1 sibling, 3 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-10 23:51 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Eric W. Biederman, Jeff King, Junio C Hamano, Patrick Steinhardt

Hi,

Sorry for taking so long to find some time to review.  And sorry for
the bad news, but...

On Mon, Oct 23, 2023 at 3:45 PM Taylor Blau <me@ttaylorr.com> wrote:
>
> When using merge-tree often within a repository[^1], it is possible to
> generate a relatively large number of loose objects, which can result in
> degraded performance, and inode exhaustion in extreme cases.
>
> Building on the functionality introduced in previous commits, the
> bulk-checkin machinery now has support to write arbitrary blob and tree
> objects which are small enough to be held in-core. We can use this to
> write any blob/tree objects generated by ORT into a separate pack
> instead of writing them out individually as loose.
>
> This functionality is gated behind a new `--write-pack` option to
> `merge-tree` that works with the (non-deprecated) `--write-tree` mode.
>
> The implementation is relatively straightforward. There are two spots
> within the ORT mechanism where we call `write_object_file()`, one for
> content differences within blobs, and another to assemble any new trees
> necessary to construct the merge. In each of those locations,
> conditionally replace calls to `write_object_file()` with
> `index_blob_bulk_checkin_incore()` or `index_tree_bulk_checkin_incore()`
> depending on which kind of object we are writing.
>
> The only remaining task is to begin and end the transaction necessary to
> initialize the bulk-checkin machinery, and move any new pack(s) it
> created into the main object store.

I believe the above is built on an assumption that any objects written
will not need to be read until after the merge is completed.  And we
might have a nesting issue too...

> @@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt,
>                 if ((merge_status < 0) || !result_buf.ptr)
>                         ret = error(_("failed to execute internal merge"));
>
> -               if (!ret &&
> -                   write_object_file(result_buf.ptr, result_buf.size,
> -                                     OBJ_BLOB, &result->oid))
> -                       ret = error(_("unable to add %s to database"), path);
> +               if (!ret) {
> +                       ret = opt->write_pack
> +                               ? index_blob_bulk_checkin_incore(&result->oid,
> +                                                                result_buf.ptr,
> +                                                                result_buf.size,
> +                                                                path, 1)
> +                               : write_object_file(result_buf.ptr,
> +                                                   result_buf.size,
> +                                                   OBJ_BLOB, &result->oid);
> +                       if (ret)
> +                               ret = error(_("unable to add %s to database"),
> +                                           path);
> +               }
>
>                 free(result_buf.ptr);
>                 if (ret)

This is unsafe; the object may need to be read later within the same
merge.  Perhaps the simplest example related to your testcase is
modifying the middle section of that testcase (I'll highlight where
when I comment on the testcase) to read:

    test_commit -C repo base file "$(test_seq 3 5)" &&
    git -C repo branch -M main &&
    git -C repo checkout -b side main &&
    test_commit -C repo side-file file "$(test_seq 1 5)" &&
    test_commit -C repo side-file2 file2 "$(test_seq 1 6)" &&
    git -C repo checkout -b other main &&
    test_commit -C repo other-file file "$(test_seq 3 7)" &&
    git -C repo mv file file2 &&
    git -C repo commit -m other-file2 &&

    find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
    git -C repo merge-tree --write-pack side other &&

In words, what I'm doing here is having both sides modify "file" (the
same as you did) but also having one side rename "file"->"file2".  The
side that doesn't rename "file" also introduces a new "file2".  ort
needs to merge the three versions of "file" to get a new blob object,
and then merge that with the content from the brand new "file2".  More
complicated cases are possible, of course.  Anyway, with the modified
testcase above, merge-tree will fail with:

    fatal: unable to read blob object 06e567b11dfdafeaf7d3edcc89864149383aeab6

I think (untested) that you could fix this by creating two packs
instead of just one.  In particular, add a call to
flush_odb_transcation() after the "redo_after_renames" block in
merge_ort_nonrecursive_internal().  (It's probably easier to see that
you could place the flush_odb_transaction() call inside
detect_and_process_renames() just after the process_renames() call,
but when redo_after_renames is true you'd end up with three packs
instead of just two).
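
In other words, something like this (untested):

    /*
     * in merge_ort_nonrecursive_internal(), after the
     * redo_after_renames block:
     */
    if (opt->write_pack)
            flush_odb_transaction();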

What happens with the odb transaction stuff if no new objects are
written before the call to flush_odb_transaction?  Will that be a
problem?

(Since any tree written will not be re-read within the same merge, the
other write_object_file() call you changed does not have the same
problem as the one here.)

>@@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt,
>                fflush(stdout);
>                BUG("dir_metadata accounting completely off; shouldn't happen");
>        }
>-       if (write_tree(result_oid, &dir_metadata.versions, 0,
>+       if (write_tree(opt, result_oid, &dir_metadata.versions, 0,
>                       opt->repo->hash_algo->rawsz) < 0)
>                ret = -1;
>
> +
> +       if (opt->write_pack)
> +               end_odb_transaction();
> +

Is the end_odb_transaction() here going to fail with an "Unbalanced
ODB transaction nesting" when faced with a recursive merge?

Perhaps flushing here, and then calling end_odb_transaction() in
merge_finalize(), much as you do in your replay-and-write-pack series,
should actually be moved to this commit and included here?

This does mean that for a recursive merge, that you'll get up to 2*N
packfiles, where N is the depth of the recursive merge.

> +       test_commit -C repo base file "$(test_seq 3 5)" &&
> +       git -C repo branch -M main &&
> +       git -C repo checkout -b side main &&
> +       test_commit -C repo side file "$(test_seq 1 5)" &&
> +       git -C repo checkout -b other main &&
> +       test_commit -C repo other file "$(test_seq 3 7)" &&
> +
> +       find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
> +       tree="$(git -C repo merge-tree --write-pack \
> +               refs/tags/side refs/tags/other)" &&

These were the lines from your testcase that I replaced to show the bug.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()`
  2023-10-25 17:21         ` Eric Sunshine
  2023-10-26  8:16           ` Patrick Steinhardt
@ 2023-11-11  0:17           ` Elijah Newren
  1 sibling, 0 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-11  0:17 UTC (permalink / raw)
  To: Eric Sunshine
  Cc: Taylor Blau, Patrick Steinhardt, git, Eric W. Biederman,
	Jeff King, Junio C Hamano

On Wed, Oct 25, 2023 at 10:22 AM Eric Sunshine <sunshine@sunshineco.com> wrote:
>
> On Wed, Oct 25, 2023 at 11:44 AM Taylor Blau <me@ttaylorr.com> wrote:
> > On Wed, Oct 25, 2023 at 09:58:06AM +0200, Patrick Steinhardt wrote:
> > > On Mon, Oct 23, 2023 at 06:45:01PM -0400, Taylor Blau wrote:
> > > > In order to support streaming from a location in memory, we must
> > > > implement a new kind of bulk_checkin_source that does just that. These
> > > > implementation in spread out across:
> > >
>
> Perhaps:
>
>     s/implementation in/implementations are/

Was just about to comment with the same suggestion; I had a hard time
parsing the commit message as well because of this.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-10 23:51     ` Elijah Newren
@ 2023-11-11  0:27       ` Junio C Hamano
  2023-11-11  1:34         ` Taylor Blau
  2023-11-11  1:24       ` Taylor Blau
  2023-11-13 22:02       ` Jeff King
  2 siblings, 1 reply; 63+ messages in thread
From: Junio C Hamano @ 2023-11-11  0:27 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Taylor Blau, git, Eric W. Biederman, Jeff King, Patrick Steinhardt

Elijah Newren <newren@gmail.com> writes:

> I believe the above is built on an assumption that any objects written
> will not need to be read until after the merge is completed.  And we
> might have a nesting issue too...
> ...
> This is unsafe; the object may need to be read later within the same
> merge.

Thanks for a good analysis.  I agree.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-10 23:51     ` Elijah Newren
  2023-11-11  0:27       ` Junio C Hamano
@ 2023-11-11  1:24       ` Taylor Blau
  2023-11-13 22:05         ` Jeff King
  2023-11-13 22:02       ` Jeff King
  2 siblings, 1 reply; 63+ messages in thread
From: Taylor Blau @ 2023-11-11  1:24 UTC (permalink / raw)
  To: Elijah Newren
  Cc: git, Eric W. Biederman, Jeff King, Junio C Hamano, Patrick Steinhardt

On Fri, Nov 10, 2023 at 03:51:18PM -0800, Elijah Newren wrote:
> > @@ -2108,10 +2109,19 @@ static int handle_content_merge(struct merge_options *opt,
> >                 if ((merge_status < 0) || !result_buf.ptr)
> >                         ret = error(_("failed to execute internal merge"));
> >
> > -               if (!ret &&
> > -                   write_object_file(result_buf.ptr, result_buf.size,
> > -                                     OBJ_BLOB, &result->oid))
> > -                       ret = error(_("unable to add %s to database"), path);
> > +               if (!ret) {
> > +                       ret = opt->write_pack
> > +                               ? index_blob_bulk_checkin_incore(&result->oid,
> > +                                                                result_buf.ptr,
> > +                                                                result_buf.size,
> > +                                                                path, 1)
> > +                               : write_object_file(result_buf.ptr,
> > +                                                   result_buf.size,
> > +                                                   OBJ_BLOB, &result->oid);
> > +                       if (ret)
> > +                               ret = error(_("unable to add %s to database"),
> > +                                           path);
> > +               }
> >
> >                 free(result_buf.ptr);
> >                 if (ret)
>
> This is unsafe; the object may need to be read later within the same
> merge.  Perhaps the simplest example related to your testcase is
> modifying the middle section of that testcase (I'll highlight where
> when I comment on the testcase) to read:
>
>     test_commit -C repo base file "$(test_seq 3 5)" &&
>     git -C repo branch -M main &&
>     git -C repo checkout -b side main &&
>     test_commit -C repo side-file file "$(test_seq 1 5)" &&
>     test_commit -C repo side-file2 file2 "$(test_seq 1 6)" &&
>     git -C repo checkout -b other main &&
>     test_commit -C repo other-file file "$(test_seq 3 7)" &&
>     git -C repo mv file file2 &&
>     git -C repo commit -m other-file2 &&
>
>     find repo/$packdir -type f -name "pack-*.idx" >packs.before &&
>     git -C repo merge-tree --write-pack side other &&
>
> In words, what I'm doing here is having both sides modify "file" (the
> same as you did) but also having one side rename "file"->"file2".  The
> side that doesn't rename "file" also introduces a new "file2".  ort
> needs to merge the three versions of "file" to get a new blob object,
> and then merge that with the content from the brand new "file2".  More
> complicated cases are possible, of course.  Anyway, with the modified
> testcase above, merge-tree will fail with:
>
>     fatal: unable to read blob object 06e567b11dfdafeaf7d3edcc89864149383aeab6
>
> I think (untested) that you could fix this by creating two packs
> instead of just one.  In particular, add a call to
> flush_odb_transaction() after the "redo_after_renames" block in
> merge_ort_nonrecursive_internal().  (It's probably easier to see that
> you could place the flush_odb_transaction() call inside
> detect_and_process_renames() just after the process_renames() call,
> but when redo_after_renames is true you'd end up with three packs
> instead of just two).

Great analysis, thanks for catching this error. I tested your approach,
and indeed a flush_odb_transaction() call after the process_renames()
call in detect_and_process_renames() does do the trick.

> What happens with the odb transaction stuff if no new objects are
> written before the call to flush_odb_transaction?  Will that be a
> problem?

I think that the bulk-checkin code is flexible enough to understand that
we shouldn't do anything when there aren't any objects to pack.

> (Since any tree written will not be re-read within the same merge, the
> other write_object_file() call you changed does not have the same
> problem as the one here.)
>
> >@@ -4332,9 +4349,13 @@ static int process_entries(struct merge_options *opt,
> >                fflush(stdout);
> >                BUG("dir_metadata accounting completely off; shouldn't happen");
> >        }
> >-       if (write_tree(result_oid, &dir_metadata.versions, 0,
> >+       if (write_tree(opt, result_oid, &dir_metadata.versions, 0,
> >                       opt->repo->hash_algo->rawsz) < 0)
> >                ret = -1;
> >
> > +
> > +       if (opt->write_pack)
> > +               end_odb_transaction();
> > +
>
> Is the end_odb_transaction() here going to fail with an "Unbalanced
> ODB transaction nesting" when faced with a recursive merge?

I think so, and we should have a test-case demonstrating that. In the
remaining three patches that I posted to extend this approach to 'git
replay', I moved this call elsewhere in such a way that I think avoids
this problem:

> Perhaps flushing here, and then calling end_odb_transaction() in
> merge_finalize(), much as you do in your replay-and-write-pack series,
> should actually be moved to this commit and included here?

Yep, exactly.

> This does mean that for a recursive merge, that you'll get up to 2*N
> packfiles, where N is the depth of the recursive merge.

We definitely want to avoid that ;-). I think there are a couple of
potential directions forward here, but the most promising one, I think,
is due to Johannes, who suggests that we write loose objects into a
temporary directory (via a tmp_objdir_replace_primary_odb() call), and
then repack
that side directory before migrating a single pack back into the main
object store.

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-11  0:27       ` Junio C Hamano
@ 2023-11-11  1:34         ` Taylor Blau
  0 siblings, 0 replies; 63+ messages in thread
From: Taylor Blau @ 2023-11-11  1:34 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Elijah Newren, git, Eric W. Biederman, Jeff King, Patrick Steinhardt

On Sat, Nov 11, 2023 at 09:27:42AM +0900, Junio C Hamano wrote:
> Elijah Newren <newren@gmail.com> writes:
>
> > I believe the above is built on an assumption that any objects written
> > will not need to be read until after the merge is completed.  And we
> > might have a nesting issue too...
> > ...
> > This is unsafe; the object may need to be read later within the same
> > merge.
>
> Thanks for a good analysis.  I agree.

Ditto. I responded to Elijah more in-depth elsewhere in the thread, but
I think for your purposes it is OK to discard this series.

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH 0/3] replay: implement support for writing new objects to a pack
  2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
                             ` (2 preceding siblings ...)
  2023-11-07 18:23           ` [RFC PATCH 3/3] builtin/replay.c: introduce `--write-pack` Taylor Blau
@ 2023-11-11  3:42           ` Elijah Newren
  2023-11-11  4:04           ` Elijah Newren
  4 siblings, 0 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-11  3:42 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Jeff King, Patrick Steinhardt, Junio C Hamano, Johannes Schindelin

On Tue, Nov 7, 2023 at 10:22 AM Taylor Blau <me@ttaylorr.com> wrote:
>
> (Based on a combination of Christian's cc/git-replay and my
> tb/merge-tree-write-pack branches).
>
> This RFC demonstrates extending the new `--write-pack` option that
> `merge-tree` recently learned to the `replay` builtin as well.
>
> The approach is as follows:
>
>   - write a pack out after each step in the replay operation, so that
>     subsequent steps may see any new object(s) created during previous
>     steps
>
>   - combine those packs into one before migrating them back into the
>     main object store
>
> This is accomplished with a combination of the bulk-checkin and
> tmp-objdir APIs, with some minor modifications made to when we flush out
> and finalize bulk-checkin transactions.
>
> The benefit to this approach is that we bound the number of inodes
> required per replayed commit to a constant (by default, 3: one for the
> .pack, one for the .idx, and another for the .rev file), instead of
> having each operation take an unbounded number of inodes proportional to
> the number of new objects created during that step. We also only migrate
> a single pack back to the main object store.

Isn't it actually 4?  Since you only put blobs and trees into the bulk
checkin packfiles, the commit object will still be loose.

> In other words, this makes the maximum number of inodes required by
> 'replay' grow proportional to the number of commits being replayed,
> instead of the number of new *objects* created as a result of the replay
> operation.

As per comments on the other series, we actually need 2 packs per
commit (when replaying non-merge commits, we don't have to worry about
recursive merges, so 2*depth is just 2), so this would be 7 inodes per
commit (3 files per pack * 2 packs + 1 loose commit object).

I was curious what the comparative number of loose objects might be if
we didn't use the bulk checkin, so:
$ cd linux.git
$ git rev-list --no-merges --count HEAD
1141436
$ time git log --oneline --no-merges --name-only | wc -l
3781628
$ python -c 'print(3781628/1141436)'
3.3130442705504293

So, if repositories are similar to linux, that's a bit over 3 files
modified per commit.  It's a bit harder to count trees, but let's take
a wild guess and say that each file is 7 directories deep (because I'm too
lazy to do real research and 7 happens to give me nice round numbers
later), so that's up to 7*3 tree objects.  So 3 file objects + 21 tree
objects + 1 commit object = 25 loose objects.

So, your scheme certainly seems to reduce the number of inodes, but does
it introduce a different scaling issue?  git-gc repacks when there are
>= 50 packs, or when there are >= 6700 loose objects (the defaults for
gc.autoPackLimit and gc.auto, respectively), suggesting that if we
exceed those numbers, we might start seeing performance suffer.
We would hit 50 packs with a mere 25 commits being replayed, and
wouldn't expect to get to 6700 loose objects until we replayed about
268 commits (6700/25).  Does this mean we are risking worse
performance degradation with this scheme than with just using loose
objects until the end of the operation?

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step
  2023-11-07 18:22           ` [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step Taylor Blau
@ 2023-11-11  3:45             ` Elijah Newren
  0 siblings, 0 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-11  3:45 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Jeff King, Patrick Steinhardt, Junio C Hamano, Johannes Schindelin

On Tue, Nov 7, 2023 at 10:22 AM Taylor Blau <me@ttaylorr.com> wrote:
>
> In a previous commit, the ORT merge backend learned how to use the
> bulk-checkin mechanism to emit a single pack containing any new objects
> created during the merge. This functionality was implemented by setting
> up a new ODB transaction, and finalizing it at the end of the merge via
> `process_entries()`.
>
> In a future commit, we will extend this functionality to the new `git
> replay` command, which needs to see objects from previous steps in order
> to replay each commit.
>
> As a step towards implementing this, teach the ORT backend to flush the
> ODB transaction at the end of each step in `process_entries()`, and then
> finalize the result with `end_odb_transaction()` when calling
> `merge_finalize()`.

process_entries() contains a for loop inside it, so "end of each step
in `process_entries()`" sounds like you are flushing after entry, i.e.
several times per commit being replayed.

Perhaps "at the end of `process_entries()` (thus creating one pack per
commit)" ?


(Of course, the fact that we need this change earlier, in the other
series, kinda makes this point moot.  Just thought I'd mention it
anyway in case it comes back up in your restructuring.)
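
To make the distinction concrete, here is a minimal sketch (merge-ort
internals elided; the function name and the flush call are real, the
rest is a placeholder -- only the placement of the flush relative to
the per-entry loop matters):

static void process_entries(struct merge_options *opt,
			    struct object_id *result_oid)
{
	/*
	 * ... for each path touched by the merge: process one entry,
	 * which may write new blob and tree objects ...
	 */

	/*
	 * Flushing here, after the per-entry loop, yields one pack per
	 * process_entries() call, i.e. one per replayed commit;
	 * flushing inside the loop would yield one pack per entry.
	 */
	flush_odb_transaction();
}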

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC PATCH 0/3] replay: implement support for writing new objects to a pack
  2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
                             ` (3 preceding siblings ...)
  2023-11-11  3:42           ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Elijah Newren
@ 2023-11-11  4:04           ` Elijah Newren
  4 siblings, 0 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-11  4:04 UTC (permalink / raw)
  To: Taylor Blau
  Cc: git, Jeff King, Patrick Steinhardt, Junio C Hamano, Johannes Schindelin

On Tue, Nov 7, 2023 at 10:22 AM Taylor Blau <me@ttaylorr.com> wrote:
>
> (Based on a combination of Christian's cc/git-replay and my
> tb/merge-tree-write-pack branches).
>
> This RFC demonstrates extending the new `--write-pack` option that
> `merge-tree` recently learned to the `replay` builtin as well.
>
> The approach is as follows:
>
>   - write a pack out after each step in the replay operation, so that
>     subsequent steps may see any new object(s) created during previous
>     steps
>
>   - combine those packs into one before migrating them back into the
>     main object store
>
> This is accomplished with a combination of the bulk-checkin and
> tmp-objdir APIs, with some minor modifications made to when we flush out
> and finalize bulk-checkin transactions.

Just thought I'd note that I finished reading all of this series as
well as the five-commit series it's based on.  Just wanted to note
that any commits I didn't comment on from either series looked good to
me.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-10 23:51     ` Elijah Newren
  2023-11-11  0:27       ` Junio C Hamano
  2023-11-11  1:24       ` Taylor Blau
@ 2023-11-13 22:02       ` Jeff King
  2023-11-13 22:34         ` Taylor Blau
  2 siblings, 1 reply; 63+ messages in thread
From: Jeff King @ 2023-11-13 22:02 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Taylor Blau, git, Eric W. Biederman, Junio C Hamano, Patrick Steinhardt

On Fri, Nov 10, 2023 at 03:51:18PM -0800, Elijah Newren wrote:

> This is unsafe; the object may need to be read later within the same
> merge. [...]
>
> I think (untested) that you could fix this by creating two packs
> instead of just one.  In particular, add a call to
> flush_odb_transaction() after the "redo_after_renames" block in
> merge_ort_nonrecursive_internal().

It might not be too hard to just let in-process callers access the
objects we've written. A quick and dirty patch is below, which works
with the test you suggested (the test still fails because it finds a
conflict, but it gets past the "woah, I can't find that sha1" part).

I don't know if that is sufficient, though. Would other spawned
processes (hooks, external merge drivers, and so on) need to be able to
see these objects, too?

The patch teaches the packfile code about the special bulk checkin pack.
It might be cleaner to invert it, and just have the bulk checkin code
register a magic packed_git (it would need to fake the .idx somehow).

diff --git a/bulk-checkin.c b/bulk-checkin.c
index bd6151ba3c..566fc36e68 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -30,6 +30,8 @@ static struct bulk_checkin_packfile {
 	struct pack_idx_entry **written;
 	uint32_t alloc_written;
 	uint32_t nr_written;
+
+	struct packed_git *fake_packed_git;
 } bulk_checkin_packfile;
 
 static void finish_tmp_packfile(struct strbuf *basename,
@@ -82,6 +84,7 @@ static void flush_bulk_checkin_packfile(struct bulk_checkin_packfile *state)
 
 clear_exit:
 	free(state->written);
+	free(state->fake_packed_git);
 	memset(state, 0, sizeof(*state));
 
 	strbuf_release(&packname);
@@ -530,3 +533,37 @@ void end_odb_transaction(void)
 
 	flush_odb_transaction();
 }
+
+static struct packed_git *fake_packed_git(struct bulk_checkin_packfile *state)
+{
+	struct packed_git *p = state->fake_packed_git;
+	if (!p) {
+		FLEX_ALLOC_STR(p, pack_name, "/fake/in-progress.pack");
+		state->fake_packed_git = p;
+		p->pack_fd = state->f->fd;
+		p->do_not_close = 1;
+	}
+
+	hashflush(state->f);
+	p->pack_size = state->f->total; /* maybe add 20 to simulate trailer? */
+
+	return p;
+}
+
+int bulk_checkin_pack_entry(const struct object_id *oid, struct pack_entry *e)
+{
+	size_t i;
+	/*
+	 * this really ought to have some more efficient data structure for
+	 * lookup; ditto for the existing already_written()
+	 */
+	for (i = 0; i < bulk_checkin_packfile.nr_written; i++) {
+		struct pack_idx_entry *p = bulk_checkin_packfile.written[i];
+		if (oideq(&p->oid, oid)) {
+			e->p = fake_packed_git(&bulk_checkin_packfile);
+			e->offset = p->offset;
+			return 0;
+		}
+	}
+	return -1;
+}
diff --git a/bulk-checkin.h b/bulk-checkin.h
index 89786b3954..153fe87c06 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -44,4 +44,7 @@ void flush_odb_transaction(void);
  */
 void end_odb_transaction(void);
 
+struct pack_entry;
+int bulk_checkin_pack_entry(const struct object_id *oid, struct pack_entry *e);
+
 #endif
diff --git a/packfile.c b/packfile.c
index 9cc0a2e37a..05194b1d9b 100644
--- a/packfile.c
+++ b/packfile.c
@@ -23,6 +23,7 @@
 #include "commit-graph.h"
 #include "pack-revindex.h"
 #include "promisor-remote.h"
+#include "bulk-checkin.h"
 
 char *odb_pack_name(struct strbuf *buf,
 		    const unsigned char *hash,
@@ -2045,6 +2046,9 @@ int find_pack_entry(struct repository *r, const struct object_id *oid, struct pa
 	struct list_head *pos;
 	struct multi_pack_index *m;
 
+	if (!bulk_checkin_pack_entry(oid, e))
+		return 1;
+
 	prepare_packed_git(r);
 	if (!r->objects->packed_git && !r->objects->multi_pack_index)
 		return 0;
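
For illustration only: with the hook above, a hypothetical in-process
caller (`oid` here is assumed to name a just-written object) would
resolve it like any other packed object, modulo the open-pack caveats
discussed later in the thread:

enum object_type type;
unsigned long size;
struct pack_entry e;

if (find_pack_entry(the_repository, &oid, &e)) {
	/* e.p is the fake packed_git backed by the still-open pack */
	void *data = unpack_entry(the_repository, e.p, e.offset,
				  &type, &size);
	/* ... use data ... */
	free(data);
}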

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-11  1:24       ` Taylor Blau
@ 2023-11-13 22:05         ` Jeff King
  2023-11-14  1:40           ` Junio C Hamano
  2023-11-14  3:08           ` Elijah Newren
  0 siblings, 2 replies; 63+ messages in thread
From: Jeff King @ 2023-11-13 22:05 UTC (permalink / raw)
  To: Taylor Blau
  Cc: Elijah Newren, git, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Fri, Nov 10, 2023 at 08:24:44PM -0500, Taylor Blau wrote:

> > This does mean that for a recursive merge, you'll get up to 2*N
> > packfiles, where N is the depth of the recursive merge.
> 
> We definitely want to avoid that ;-). I think there are a couple of
> potential directions forward here, but the most promising one I think is
> due to Johannes who suggests that we write loose objects into a
> temporary directory with a replace_tmp_objdir() call, and then repack
> that side directory before migrating a single pack back into the main
> object store.

I posted an alternative in response to Elijah; the general idea being to
allow the usual object-lookup code to access the in-progress pack. That
would keep us limited to a single pack.

It _might_ be a terrible idea. E.g., if you write a non-bulk object that
references a bulk one, then that non-bulk one is broken from the
perspective of other processes (until the bulk checkin is flushed). But
I think we'd always be writing to one or the other here, never
interleaving?

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-13 22:02       ` Jeff King
@ 2023-11-13 22:34         ` Taylor Blau
  2023-11-14  2:50           ` Elijah Newren
  2023-11-14 22:04           ` Jeff King
  0 siblings, 2 replies; 63+ messages in thread
From: Taylor Blau @ 2023-11-13 22:34 UTC (permalink / raw)
  To: Jeff King
  Cc: Elijah Newren, git, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Mon, Nov 13, 2023 at 05:02:54PM -0500, Jeff King wrote:
> On Fri, Nov 10, 2023 at 03:51:18PM -0800, Elijah Newren wrote:
>
> > This is unsafe; the object may need to be read later within the same
> > merge. [...]
> >
> > I think (untested) that you could fix this by creating two packs
> > instead of just one.  In particular, add a call to
> > flush_odb_transaction() after the "redo_after_renames" block in
> > merge_ort_nonrecursive_internal().
>
> It might not be too hard to just let in-process callers access the
> objects we've written. A quick and dirty patch is below, which works
> with the test you suggested (the test still fails because it finds a
> conflict, but it gets past the "woah, I can't find that sha1" part).

That's a very slick idea, and I think that this series has some legs to
stand on now as a result.

There are a few interesting conclusions we can draw here:

  1. This solves the recursive merge problem that Elijah mentioned
     earlier, where we could generate up to 2*N packs (where N is the
     maximum depth of the recursive merge).

  2. This also solves the case where the merge-ort code needs to read an
     object that it wrote earlier on in the same process without having
     to flush out intermediate packs. So in the best case we need only a
     single pack (instead of two).

  3. This also solves the 'replay' issue, *and* allows us to avoid the
     tmp-objdir thing there entirely, since we can simulate object reads
     in the bulk-checkin pack.

So I think that this is a direction worth pursuing.

In terms of making those lookups faster, I think that what you'd want is
an oidmap. The overhead is slightly unfortunate, but I think that any
other solution which requires keeping the written array in sorted order
would in practice be more expensive as you have to constantly reallocate
and copy portions of the array around to maintain its invariant.
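
For concreteness, a minimal sketch of that direction (the entry type
and both helpers below are assumptions for illustration, not code from
the series; oidmap.h is Git's existing hashmap wrapper keyed on object
IDs):

#include "oidmap.h"

/* one entry per object written into the bulk-checkin pack */
struct written_entry {
	struct oidmap_entry entry; /* must come first; embeds the oid key */
	off_t offset;              /* where the object sits in the pack */
};

static struct oidmap written_map = OIDMAP_INIT;

/* record an object as it is deflated into the pack */
static void record_written(const struct object_id *oid, off_t offset)
{
	struct written_entry *e = xmalloc(sizeof(*e));

	oidcpy(&e->entry.oid, oid);
	e->offset = offset;
	oidmap_put(&written_map, e);
}

/* average-constant-time replacement for the linear scan */
static struct written_entry *lookup_written(const struct object_id *oid)
{
	return oidmap_get(&written_map, oid);
}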

> I don't know if that is sufficient, though. Would other spawned
> processes (hooks, external merge drivers, and so on) need to be able to
> see these objects, too?

Interesting point. In theory those processes could ask about newly
created objects, and if they were spawned before the bulk-checkin pack
was flushed, those lookups would fail.

But that requires, e.g. for hooks, that we know a priori the object
ID of some newly-written objects. If you wanted to make those lookups
succeed, I think there are a couple of options:

  - teach child-processes about the bulk-checkin pack, and let them
    perform the same fake lookup

  - flush (but do not close) the bulk-checkin transaction

In any event, I think that this is a sufficiently rare and niche case
that we'd be OK to declare that you should not expect the above
scenarios to work when using `--write-pack`. If someone does ask for
that feature in the future, we could implement it relatively painlessly
using one of the above options.

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-13 22:05         ` Jeff King
@ 2023-11-14  1:40           ` Junio C Hamano
  2023-11-14  2:54             ` Elijah Newren
  2023-11-14 21:55             ` Jeff King
  2023-11-14  3:08           ` Elijah Newren
  1 sibling, 2 replies; 63+ messages in thread
From: Junio C Hamano @ 2023-11-14  1:40 UTC (permalink / raw)
  To: Jeff King
  Cc: Taylor Blau, Elijah Newren, git, Eric W. Biederman, Patrick Steinhardt

Jeff King <peff@peff.net> writes:

> I posted an alternative in response to Elijah; the general idea being to
> allow the usual object-lookup code to access the in-progress pack. That
> would keep us limited to a single pack.

If such a mechanism is done in a generic way, would we be able to
simplify fast-import a lot, I wonder?  IIRC, it had quite a lot of
code to remember what it has written to its output to work around
the exact issue your alternative tries to solve.  In fact, maybe we
could make fast-import a thin wrapper around the bulk checkin
infrastructure?


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-13 22:34         ` Taylor Blau
@ 2023-11-14  2:50           ` Elijah Newren
  2023-11-14 21:53             ` Jeff King
  2023-11-14 22:04           ` Jeff King
  1 sibling, 1 reply; 63+ messages in thread
From: Elijah Newren @ 2023-11-14  2:50 UTC (permalink / raw)
  To: Taylor Blau
  Cc: Jeff King, git, Eric W. Biederman, Junio C Hamano, Patrick Steinhardt

On Mon, Nov 13, 2023 at 2:34 PM Taylor Blau <me@ttaylorr.com> wrote:
>
> On Mon, Nov 13, 2023 at 05:02:54PM -0500, Jeff King wrote:
> > On Fri, Nov 10, 2023 at 03:51:18PM -0800, Elijah Newren wrote:
> >
> > > This is unsafe; the object may need to be read later within the same
> > > merge. [...]
> > >
> > > I think (untested) that you could fix this by creating two packs
> > > instead of just one.  In particular, add a call to
> > > flush_odb_transaction() after the "redo_after_renames" block in
> > > merge_ort_nonrecursive_internal().
> >
> > It might not be too hard to just let in-process callers access the
> > objects we've written. A quick and dirty patch is below, which works
> > with the test you suggested (the test still fails because it finds a
> > conflict, but it gets past the "woah, I can't find that sha1" part).
>
> That's a very slick idea, and I think that this series has some legs to
> stand on now as a result.
>
> There are a few interesting conclusions we can draw here:
>
>   1. This solves the recursive merge problem that Elijah mentioned
>      earlier, where we could generate up to 2*N packs (where N is the
>      maximum depth of the recursive merge).
>
>   2. This also solves the case where the merge-ort code needs to read an
>      object that it wrote earlier on in the same process without having
>      to flush out intermediate packs. So in the best case we need only a
>      single pack (instead of two).
>
>   3. This also solves the 'replay' issue, *and* allows us to avoid the
>      tmp-objdir thing there entirely, since we can simulate object reads
>      in the bulk-checkin pack.
>
> So I think that this is a direction worth pursuing.

Agreed; this looks great.  It's basically bringing fast-import-like
functionality -- writing objects to a single new packfile while making
previous objects accessible to subsequent ones -- to additional
callers.

> In terms of making those lookups faster, I think that what you'd want is
> an oidmap. The overhead is slightly unfortunate, but I think that any
> other solution which requires keeping the written array in sorted order
> would in practice be more expensive as you have to constantly reallocate
> and copy portions of the array around to maintain its invariant.

When comparing the overhead of an oidmap to a bunch of inodes,
however, it seems relatively cheap.  :-)

> > I don't know if that is sufficient, though. Would other spawned
> > processes (hooks, external merge drivers, and so on) need to be able to
> > see these objects, too?
>
> Interesting point. In theory those processes could ask about newly
> created objects, and if they were spawned before the bulk-checkin pack
> was flushed, those lookups would fail.

One of the big design differences that I was pushing really hard with
git-replay was performance and things that came with it -- no
worktree, no per-commit hooks (which are nearly ruled out by no
worktree, but it's still worth calling out separately), etc.  A
post-operation hook could be fine, but would also not get to assume a
worktree.

merge-tree is the same as far as hooks; I'd rather just not have them,
but if we did, they'd be a post-operation hook.

In both cases, that makes hooks not much of a sticking point.

External merge drivers, however...

> But that requires, e.g. for hooks, that we know a priori the object
> ID of some newly-written objects. If you wanted to make those lookups
> succeed, I think there are a couple of options:
>
>   - teach child-processes about the bulk-checkin pack, and let them
>     perform the same fake lookup
>
>   - flush (but do not close) the bulk-checkin transaction
>
> In any event, I think that this is a sufficiently rare and niche case
> that we'd be OK to declare that you should not expect the above
> scenarios to work when using `--write-pack`. If someone does ask for
> that feature in the future, we could implement it relatively painlessly
> using one of the above options.

Seems reasonable to me.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-14  1:40           ` Junio C Hamano
@ 2023-11-14  2:54             ` Elijah Newren
  2023-11-14 21:55             ` Jeff King
  1 sibling, 0 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-14  2:54 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Jeff King, Taylor Blau, git, Eric W. Biederman, Patrick Steinhardt

On Mon, Nov 13, 2023 at 5:41 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Jeff King <peff@peff.net> writes:
>
> > I posted an alternative in response to Elijah; the general idea being to
> > allow the usual object-lookup code to access the in-progress pack. That
> > would keep us limited to a single pack.
>
> If such a mechanism is done in a generic way, would we be able to
> simplify fast-import a lot, I wonder?  IIRC, it had quite a lot of
> code to remember what it has written to its output to work around
> the exact issue your alternative tries to solve.  In fact, maybe we
> could make fast-import a thin wrapper around the bulk checkin
> infrastructure?

fast-import also attempts to delta objects against previous ones as it
writes them to the pack.  That's one thing still lacking from this
solution, but aside from that, it also sounds to me like the bulk
checkin infrastructure is getting closer to becoming a replacement for
much of the fast-import code.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-13 22:05         ` Jeff King
  2023-11-14  1:40           ` Junio C Hamano
@ 2023-11-14  3:08           ` Elijah Newren
  1 sibling, 0 replies; 63+ messages in thread
From: Elijah Newren @ 2023-11-14  3:08 UTC (permalink / raw)
  To: Jeff King
  Cc: Taylor Blau, git, Eric W. Biederman, Junio C Hamano, Patrick Steinhardt

On Mon, Nov 13, 2023 at 2:05 PM Jeff King <peff@peff.net> wrote:
>
> On Fri, Nov 10, 2023 at 08:24:44PM -0500, Taylor Blau wrote:
>
> > > This does mean that for a recursive merge, you'll get up to 2*N
> > > packfiles, where N is the depth of the recursive merge.
> >
> > We definitely want to avoid that ;-). I think there are a couple of
> > potential directions forward here, but the most promising one I think is
> > due to Johannes who suggests that we write loose objects into a
> > temporary directory with a replace_tmp_objdir() call, and then repack
> > that side directory before migrating a single pack back into the main
> > object store.
>
> I posted an alternative in response to Elijah; the general idea being to
> allow the usual object-lookup code to access the in-progress pack. That
> would keep us limited to a single pack.
>
> It _might_ be a terrible idea. E.g., if you write a non-bulk object that
> references a bulk one, then that non-bulk one is broken from the
> perspective of other processes (until the bulk checkin is flushed). But
> I think we'd always be writing to one or the other here, never
> interleaving?

I think it sounds like a really cool idea, personally.  I also don't
see why any current uses would result in interleaving.

fast-import's created pack has the same kind of restriction...and has
even grown some extensions to work around the need for early access.
In particular, it has a `checkpoint` feature for flushing the current
pack and starting a new one, and also gained the `cat-blob` command
for when folks really wanted to access an object without writing out
the whole packfile.  So, _if_ the need for early access arises, we
have some prior art to look to for ways to provide it.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-14  2:50           ` Elijah Newren
@ 2023-11-14 21:53             ` Jeff King
  0 siblings, 0 replies; 63+ messages in thread
From: Jeff King @ 2023-11-14 21:53 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Taylor Blau, git, Eric W. Biederman, Junio C Hamano, Patrick Steinhardt

On Mon, Nov 13, 2023 at 06:50:08PM -0800, Elijah Newren wrote:

> merge-tree is the same as far as hooks; I'd rather just not have them,
> but if we did, they'd be a post-operation hook.
> 
> In both cases, that makes hooks not much of a sticking point.
> 
> External merge drivers, however...

I suspect external merge drivers are OK, in the sense that they should
not be looking up arbitrary objects anyway. We should hand the driver
what it needs via tempfiles, etc.

That said, I'm also OK with the notion that "--write-pack" is optional,
and if some features don't work with it, that's OK. It's nice if we can
flag that explicitly (rather than having things break in weird and
subtle ways), but I think we might not even know about an implicit
assumption until somebody runs across it in practice.

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-14  1:40           ` Junio C Hamano
  2023-11-14  2:54             ` Elijah Newren
@ 2023-11-14 21:55             ` Jeff King
  1 sibling, 0 replies; 63+ messages in thread
From: Jeff King @ 2023-11-14 21:55 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Taylor Blau, Elijah Newren, git, Eric W. Biederman, Patrick Steinhardt

On Tue, Nov 14, 2023 at 10:40:58AM +0900, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > I posted an alternative in response to Elijah; the general idea being to
> > allow the usual object-lookup code to access the in-progress pack. That
> > would keep us limited to a single pack.
> 
> If such a mechanism is done in a generic way, would we be able to
> simplify fast-import a lot, I wonder?  IIRC, it had quite a lot of
> code to remember what it has written to its output to work around
> the exact issue your alternative tries to solve.  In fact, maybe we
> could make fast-import a thin wrapper around the bulk checkin
> infrastructure?

I suspect that the implementation could be shared with fast-import. I'm
not sure it would save all that much code, though. There's a lot going
on in fast-import besides keeping track of which objects we wrote into a
pack. ;)

The bigger issue, though, is that fast-import does generate some deltas
and the bulk checkin code does not. And I'm not sure how the bulk
checkin interface would expose that API (you need the caller to say
"...and I suspect this object might be a good delta base").

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack`
  2023-11-13 22:34         ` Taylor Blau
  2023-11-14  2:50           ` Elijah Newren
@ 2023-11-14 22:04           ` Jeff King
  1 sibling, 0 replies; 63+ messages in thread
From: Jeff King @ 2023-11-14 22:04 UTC (permalink / raw)
  To: Taylor Blau
  Cc: Elijah Newren, git, Eric W. Biederman, Junio C Hamano,
	Patrick Steinhardt

On Mon, Nov 13, 2023 at 05:34:42PM -0500, Taylor Blau wrote:

> > It might not be too hard to just let in-process callers access the
> > objects we've written. A quick and dirty patch is below, which works
> > with the test you suggested (the test still fails because it finds a
> > conflict, but it gets past the "woah, I can't find that sha1" part).
> 
> That's a very slick idea, and I think that this series has some legs to
> stand on now as a result.

I'm glad people seem to like it. ;)

I was mostly throwing it out there as an illustration, and I don't plan
on polishing it up further. But in case somebody else wants to work on
it, here are random extra thoughts on the topic:

  - rather than teach packfile.c about bulk checkin, I think it might be
    cleaner to let packed_git structs have a function pointer for
    accessing the index (and if NULL, naturally we'd open the .idx file
    in the usual way). And then bulk checkin could just register the
    "fake" packed_git (see the sketch after this list)

  - the flip side, though, is: would it be weird for other parts of the
    code to ever see the fake bulk packed_git in the list? It doesn't
    represent a real packfile the way the other ones do. So maybe it is
    better to keep it separate

  - I suspect this scheme may violate some bounds-checking of the
    packfiles. For example, we haven't written the hashfile trailer yet in
    the still-open packfile. But I think use_pack() assumes we always
    have an extra the_hash_algo->rawsz bytes of padding at the end. I'm
    not sure if that would ever cause us to accidentally read past the
    end of the file.

  - as you mentioned, an oidmap would be an obvious replacement for the
    existing unsorted list

  - the existing already_written() function faces the same problem. I
    don't think anybody cared because we'd usually only bulk checkin a
    few huge objects. But with --write-pack, we might write a lot of
    such objects, and the linear scan in already_written() makes it
    accidentally quadratic.

  - speaking of already_written(): it calls repo_has_object_file() to
    see if we can elide the write. It probably should be using
    freshen_*_object() in the usual way. Again, this is an existing bug
    but I suspect nobody noticed because the bulk checkin code path
    hardly ever kicks in.
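
To make the first bullet concrete, a hypothetical sketch (struct
packed_git is real; the index_lookup member is invented here purely for
illustration):

struct packed_git {
	/* ... existing fields ... */

	/*
	 * Invented member: if non-NULL, consulted instead of opening
	 * pack_name's .idx file; returns 0 and fills *ofs on a hit, -1
	 * on a miss. Bulk checkin would register a "fake" packed_git
	 * whose callback consults its written[] array (or an oidmap).
	 */
	int (*index_lookup)(struct packed_git *p,
			    const struct object_id *oid,
			    off_t *ofs);
};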

-Peff

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2023-11-14 22:04 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-19 17:28 [PATCH v4 0/7] merge-ort: implement support for packing objects together Taylor Blau
2023-10-19 17:28 ` [PATCH v4 1/7] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
2023-10-20  7:35   ` Jeff King
2023-10-20 16:55     ` Junio C Hamano
2023-10-19 17:28 ` [PATCH v4 2/7] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types Taylor Blau
2023-10-19 17:28 ` [PATCH v4 3/7] bulk-checkin: refactor deflate routine to accept a `bulk_checkin_source` Taylor Blau
2023-10-19 17:28 ` [PATCH v4 4/7] bulk-checkin: implement `SOURCE_INCORE` mode for `bulk_checkin_source` Taylor Blau
2023-10-23  9:19   ` Patrick Steinhardt
2023-10-23 18:58     ` Jeff King
2023-10-24  6:34       ` Patrick Steinhardt
2023-10-24 17:08         ` Junio C Hamano
2023-10-19 17:28 ` [PATCH v4 5/7] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
2023-10-19 17:28 ` [PATCH v4 6/7] bulk-checkin: introduce `index_tree_bulk_checkin_incore()` Taylor Blau
2023-10-19 17:29 ` [PATCH v4 7/7] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
2023-10-19 21:47 ` [PATCH v4 0/7] merge-ort: implement support for packing objects together Junio C Hamano
2023-10-20  7:29 ` Jeff King
2023-10-20 16:53   ` Junio C Hamano
2023-10-23  9:19 ` Patrick Steinhardt
2023-10-23 22:44 ` [PATCH v5 0/5] " Taylor Blau
2023-10-23 22:44   ` [PATCH v5 1/5] bulk-checkin: extract abstract `bulk_checkin_source` Taylor Blau
2023-10-25  7:37     ` Jeff King
2023-10-25 15:39       ` Taylor Blau
2023-10-27 23:12       ` Junio C Hamano
2023-10-23 22:44   ` [PATCH v5 2/5] bulk-checkin: generify `stream_blob_to_pack()` for arbitrary types Taylor Blau
2023-10-23 22:45   ` [PATCH v5 3/5] bulk-checkin: introduce `index_blob_bulk_checkin_incore()` Taylor Blau
2023-10-25  7:58     ` Patrick Steinhardt
2023-10-25 15:44       ` Taylor Blau
2023-10-25 17:21         ` Eric Sunshine
2023-10-26  8:16           ` Patrick Steinhardt
2023-11-11  0:17           ` Elijah Newren
2023-10-23 22:45   ` [PATCH v5 4/5] bulk-checkin: introduce `index_tree_bulk_checkin_incore()` Taylor Blau
2023-10-23 22:45   ` [PATCH v5 5/5] builtin/merge-tree.c: implement support for `--write-pack` Taylor Blau
2023-10-25  7:58     ` Patrick Steinhardt
2023-10-25 15:46       ` Taylor Blau
2023-11-10 23:51     ` Elijah Newren
2023-11-11  0:27       ` Junio C Hamano
2023-11-11  1:34         ` Taylor Blau
2023-11-11  1:24       ` Taylor Blau
2023-11-13 22:05         ` Jeff King
2023-11-14  1:40           ` Junio C Hamano
2023-11-14  2:54             ` Elijah Newren
2023-11-14 21:55             ` Jeff King
2023-11-14  3:08           ` Elijah Newren
2023-11-13 22:02       ` Jeff King
2023-11-13 22:34         ` Taylor Blau
2023-11-14  2:50           ` Elijah Newren
2023-11-14 21:53             ` Jeff King
2023-11-14 22:04           ` Jeff King
2023-10-23 23:31   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Junio C Hamano
2023-11-06 15:46     ` Johannes Schindelin
2023-11-06 23:19       ` Junio C Hamano
2023-11-07  3:42       ` Jeff King
2023-11-07 15:58       ` Taylor Blau
2023-11-07 18:22         ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Taylor Blau
2023-11-07 18:22           ` [RFC PATCH 1/3] merge-ort.c: finalize ODB transactions after each step Taylor Blau
2023-11-11  3:45             ` Elijah Newren
2023-11-07 18:22           ` [RFC PATCH 2/3] tmp-objdir: introduce `tmp_objdir_repack()` Taylor Blau
2023-11-08  7:05             ` Patrick Steinhardt
2023-11-09 19:26               ` Taylor Blau
2023-11-07 18:23           ` [RFC PATCH 3/3] builtin/replay.c: introduce `--write-pack` Taylor Blau
2023-11-11  3:42           ` [RFC PATCH 0/3] replay: implement support for writing new objects to a pack Elijah Newren
2023-11-11  4:04           ` Elijah Newren
2023-10-25  7:58   ` [PATCH v5 0/5] merge-ort: implement support for packing objects together Patrick Steinhardt
