All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mktree: learn about promised objects
@ 2022-06-14 13:36 Richard Oliver
  2022-06-14 14:14 ` Derrick Stolee
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Oliver @ 2022-06-14 13:36 UTC (permalink / raw)
  To: git; +Cc: jonathantanmy

Do not use oid_object_info() to determine object type in mktree_line()
as this can cause promised objects to be dynamically faulted-in one at a
time which has poor performance. Instead, use a combination of
oid_object_info_extended() (with OBJECT_INFO_SKIP_FETCH_OBJECT option),
and the newly introduced promisor_object_type() to determine object type
before defaulting to fetch from remote.

Signed-off-by: Richard Oliver <roliver@roku.com>
---
 builtin/mktree.c         | 19 ++++++++++++++--
 packfile.c               | 49 ++++++++++++++++++++++++++++------------
 packfile.h               |  6 +++++
 t/t0410-partial-clone.sh | 28 +++++++++++++++++++++++
 4 files changed, 86 insertions(+), 16 deletions(-)

diff --git a/builtin/mktree.c b/builtin/mktree.c
index 902edba6d2..a783767b28 100644
--- a/builtin/mktree.c
+++ b/builtin/mktree.c
@@ -8,6 +8,7 @@
 #include "tree.h"
 #include "parse-options.h"
 #include "object-store.h"
+#include "packfile.h"
 
 static struct treeent {
 	unsigned mode;
@@ -116,8 +117,22 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
 			path, ptr, type_name(mode_type));
 	}
 
-	/* Check the type of object identified by sha1 */
-	obj_type = oid_object_info(the_repository, &oid, NULL);
+	/* Check the type of object identified by oid without fetching objects */
+	struct object_info oi = OBJECT_INFO_INIT;
+	oi.typep = &obj_type;
+	if (oid_object_info_extended(the_repository, &oid, &oi,
+				     OBJECT_INFO_LOOKUP_REPLACE |
+				     OBJECT_INFO_SKIP_FETCH_OBJECT) < 0)
+		obj_type = -1;
+
+	/* Has the object been promised to us if we can't find it */
+	if (obj_type < 0)
+		obj_type = promisor_object_type(&oid);
+
+	/* Try to fetch the object from the remote */
+	if (obj_type < 0)
+		obj_type = oid_object_info(the_repository, &oid, NULL);
+
 	if (obj_type < 0) {
 		if (allow_missing) {
 			; /* no problem - missing objects are presumed to be of the right type */
diff --git a/packfile.c b/packfile.c
index 8e812a84a3..d27cb106c1 100644
--- a/packfile.c
+++ b/packfile.c
@@ -2223,17 +2223,26 @@ int for_each_packed_object(each_packed_object_fn cb, void *data,
 	return r ? r : pack_errors;
 }
 
+static void map_put_type(kh_oid_pos_t *map,
+	const struct object_id *oid,
+	enum object_type type)
+{
+	int hash_ret;
+	khiter_t pos = kh_put_oid_pos(map, *oid, &hash_ret);
+	kh_value(map, pos) = type;
+}
+
 static int add_promisor_object(const struct object_id *oid,
 			       struct packed_git *pack,
 			       uint32_t pos,
-			       void *set_)
+			       void *map_)
 {
-	struct oidset *set = set_;
+	kh_oid_pos_t *map = map_;
 	struct object *obj = parse_object(the_repository, oid);
 	if (!obj)
 		return 1;
 
-	oidset_insert(set, oid);
+	map_put_type(map, oid, obj->type);
 
 	/*
 	 * If this is a tree, commit, or tag, the objects it refers
@@ -2250,34 +2259,46 @@ static int add_promisor_object(const struct object_id *oid,
 			 */
 			return 0;
 		while (tree_entry_gently(&desc, &entry))
-			oidset_insert(set, &entry.oid);
+			map_put_type(map,
+				     &entry.oid,
+				     object_type(entry.mode));
 		free_tree_buffer(tree);
 	} else if (obj->type == OBJ_COMMIT) {
 		struct commit *commit = (struct commit *) obj;
 		struct commit_list *parents = commit->parents;
 
-		oidset_insert(set, get_commit_tree_oid(commit));
+		map_put_type(map, get_commit_tree_oid(commit), OBJ_TREE);
 		for (; parents; parents = parents->next)
-			oidset_insert(set, &parents->item->object.oid);
+			map_put_type(map,
+				     &parents->item->object.oid,
+				     OBJ_COMMIT);
 	} else if (obj->type == OBJ_TAG) {
 		struct tag *tag = (struct tag *) obj;
-		oidset_insert(set, get_tagged_oid(tag));
+		map_put_type(map, get_tagged_oid(tag), OBJ_COMMIT);
 	}
 	return 0;
 }
 
-int is_promisor_object(const struct object_id *oid)
+enum object_type promisor_object_type(const struct object_id *oid)
 {
-	static struct oidset promisor_objects;
-	static int promisor_objects_prepared;
+	static kh_oid_pos_t *promisor_objects;
 
-	if (!promisor_objects_prepared) {
+	if (!promisor_objects) {
+		promisor_objects = kh_init_oid_pos();
 		if (has_promisor_remote()) {
 			for_each_packed_object(add_promisor_object,
-					       &promisor_objects,
+					       promisor_objects,
 					       FOR_EACH_OBJECT_PROMISOR_ONLY);
 		}
-		promisor_objects_prepared = 1;
 	}
-	return oidset_contains(&promisor_objects, oid);
+	khiter_t pos = kh_get_oid_pos(promisor_objects, *oid);
+	if (pos < kh_end(promisor_objects))
+		return kh_value(promisor_objects, pos);
+	else
+		return OBJ_BAD;
+}
+
+int is_promisor_object(const struct object_id *oid)
+{
+	return (promisor_object_type(oid) > OBJ_BAD);
 }
diff --git a/packfile.h b/packfile.h
index a3f6723857..35d0b600d2 100644
--- a/packfile.h
+++ b/packfile.h
@@ -182,6 +182,12 @@ int has_pack_index(const unsigned char *sha1);
  */
 int is_promisor_object(const struct object_id *oid);
 
+/*
+ * Return the object type if the given object is in, or referred to
+ * by, a promisor packfile; OBJ_BAD otherwise.
+ */
+enum object_type promisor_object_type(const struct object_id *oid);
+
 /*
  * Expose a function for fuzz testing.
  *
diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh
index 1e864cf317..12bfb02187 100755
--- a/t/t0410-partial-clone.sh
+++ b/t/t0410-partial-clone.sh
@@ -660,6 +660,34 @@ test_expect_success 'lazy-fetch when accessing object not in the_repository' '
 	! grep "[?]$FILE_HASH" out
 '
 
+test_expect_success 'missing blob object promised by tree, passes mktree' '
+	rm -rf repo &&
+	test_create_repo repo &&
+	echo foo >repo/foo &&
+	A=$(git -C repo hash-object foo) &&
+	git -C repo update-index --add --info-only foo &&
+	B=$(git -C repo write-tree --missing-ok) &&
+	printf "$B\n" | pack_as_from_promisor &&
+	delete_object repo "$B" &&
+
+	git -C repo config core.repositoryformatversion 1 &&
+	git -C repo config extensions.partialclone "arbitrary string" &&
+
+	printf "100644 blob $A\tbar" | git -C repo mktree
+'
+
+test_expect_success 'missing blob object, not promised, faulted-in by mktree' '
+	test_create_repo server &&
+	A=$(echo foo | git -C server hash-object --stdin -w) &&
+	test_commit -C server 1 &&
+	test_config -C server uploadpack.allowfilter 1 &&
+	test_config -C server uploadpack.allowanysha1inwant 1 &&
+
+	rm -rf repo &&
+	git clone --filter=blob:none "file://$(pwd)/server" repo &&
+	printf "100644 blob $A\tbar" | git -C repo mktree
+'
+
 . "$TEST_DIRECTORY"/lib-httpd.sh
 start_httpd
 

base-commit: 1e59178e3f65880188caedb965e70db5ceeb2d64
-- 
2.36.1.384.gcc60f6fd7d


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-14 13:36 [PATCH] mktree: learn about promised objects Richard Oliver
@ 2022-06-14 14:14 ` Derrick Stolee
  2022-06-14 16:33   ` Richard Oliver
  0 siblings, 1 reply; 20+ messages in thread
From: Derrick Stolee @ 2022-06-14 14:14 UTC (permalink / raw)
  To: Richard Oliver, git; +Cc: jonathantanmy

On 6/14/2022 9:36 AM, Richard Oliver wrote:
> Do not use oid_object_info() to determine object type in mktree_line()
> as this can cause promised objects to be dynamically faulted-in one at a
> time which has poor performance. Instead, use a combination of
> oid_object_info_extended() (with OBJECT_INFO_SKIP_FETCH_OBJECT option),
> and the newly introduced promisor_object_type() to determine object type
> before defaulting to fetch from remote.

Have you run some performance tests on this? It seems like this will
scan all packed objects, which is probably much slower than just asking
the remote for the object in most cases.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-14 14:14 ` Derrick Stolee
@ 2022-06-14 16:33   ` Richard Oliver
  2022-06-14 17:27     ` Derrick Stolee
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Oliver @ 2022-06-14 16:33 UTC (permalink / raw)
  To: Derrick Stolee, git; +Cc: jonathantanmy

On 14/06/2022 15:14, Derrick Stolee wrote:
> On 6/14/2022 9:36 AM, Richard Oliver wrote:
>> Do not use oid_object_info() to determine object type in mktree_line()
>> as this can cause promised objects to be dynamically faulted-in one at a
>> time which has poor performance. Instead, use a combination of
>> oid_object_info_extended() (with OBJECT_INFO_SKIP_FETCH_OBJECT option),
>> and the newly introduced promisor_object_type() to determine object type
>> before defaulting to fetch from remote.
> 
> Have you run some performance tests on this? It seems like this will
> scan all packed objects, which is probably much slower than just asking
> the remote for the object in most cases.
> 
> Thanks,
> -Stolee


Hi Stolee,

I've put together a synthetic experiment below (adding a new blob to anexisting tree) to show you the behaviour that we've been seeing.  Our
actual use-case (where we first encountered this behaviour) is updating
submodules to a known hash. As you can see, the round-trip time of fetching
objects one-by-one is very expensive.

Before, using system git (git version 2.32.0 (Apple Git-132)):

> $ git init
> # Fetch a recent tree
> $ git fetch --filter=tree:0 --depth 1 https://github.com/git/git cdb48006b0ec7fe19794daf7b5363ab42d9d9371
> remote: Enumerating objects: 1, done.
> remote: Counting objects: 100% (1/1), done.
> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
> Receiving objects: 100% (1/1), 13.12 KiB | 13.12 MiB/s, done.
> From https://github.com/git/git
>  * branch            cdb48006b0ec7fe19794daf7b5363ab42d9d9371 -> FETCH_HEAD
>
> $ NEW_BLOB=$(echo zzz | git hash-object --stdin -w)
>
> $ cat <(git ls-tree FETCH_HEAD) <(printf "100644 blob ${NEW_BLOB}\tzzz") | time git mktree
> remote: Enumerating objects: 1, done.
> remote: Counting objects: 100% (1/1), done.
> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
> Receiving objects: 100% (1/1), 334 bytes | 334.00 KiB/s, done.
> remote: Enumerating objects: 1, done.
> remote: Counting objects: 100% (1/1), done.
> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
> Receiving objects: 100% (1/1), 2.01 KiB | 2.01 MiB/s, done.
> remote: Enumerating objects: 1, done.
> remote: Counting objects: 100% (1/1), done.
> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
> Receiving objects: 100% (1/1), 256 bytes | 256.00 KiB/s, done.
> # ...
> # SOME TIME LATER
> # ...
> e26c7ce7357b1649da7b4200d4e80d0b668db8d4
>       286.49 real        15.66 user        15.59 sys

Repeated experiment, but using modified mktree:

> $ cat <(git ls-tree FETCH_HEAD) <(printf "100644 blob ${NEW_BLOB}\tzzz") | time git mktree
> e26c7ce7357b1649da7b4200d4e80d0b668db8d4
>         0.06 real         0.01 user         0.03 sys

Did you have any other sort of performance test in mind? The remotes we
typically deal with are geographically far away and deal with a high volume
of traffic so we're keen to move behaviour to the client where it makes sense
to do so.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-14 16:33   ` Richard Oliver
@ 2022-06-14 17:27     ` Derrick Stolee
  2022-06-15  0:35       ` Taylor Blau
  0 siblings, 1 reply; 20+ messages in thread
From: Derrick Stolee @ 2022-06-14 17:27 UTC (permalink / raw)
  To: Richard Oliver, git; +Cc: jonathantanmy

On 6/14/2022 12:33 PM, Richard Oliver wrote:
> On 14/06/2022 15:14, Derrick Stolee wrote:
>> On 6/14/2022 9:36 AM, Richard Oliver wrote:
>>> Do not use oid_object_info() to determine object type in mktree_line()
>>> as this can cause promised objects to be dynamically faulted-in one at a
>>> time which has poor performance. Instead, use a combination of
>>> oid_object_info_extended() (with OBJECT_INFO_SKIP_FETCH_OBJECT option),
>>> and the newly introduced promisor_object_type() to determine object type
>>> before defaulting to fetch from remote.
>>
>> Have you run some performance tests on this? It seems like this will
>> scan all packed objects, which is probably much slower than just asking
>> the remote for the object in most cases.
>>
>> Thanks,
>> -Stolee
> 
> 
> Hi Stolee,
> 
> I've put together a synthetic experiment below (adding a new blob to anexisting tree) to show you the behaviour that we've been seeing.  Our
> actual use-case (where we first encountered this behaviour) is updating
> submodules to a known hash. As you can see, the round-trip time of fetching
> objects one-by-one is very expensive.
> 
> Before, using system git (git version 2.32.0 (Apple Git-132)):
> 
>> $ git init
>> # Fetch a recent tree
>> $ git fetch --filter=tree:0 --depth 1 https://github.com/git/git cdb48006b0ec7fe19794daf7b5363ab42d9d9371
>> remote: Enumerating objects: 1, done.
>> remote: Counting objects: 100% (1/1), done.
>> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
>> Receiving objects: 100% (1/1), 13.12 KiB | 13.12 MiB/s, done.
>> From https://github.com/git/git
>>  * branch            cdb48006b0ec7fe19794daf7b5363ab42d9d9371 -> FETCH_HEAD
>>
>> $ NEW_BLOB=$(echo zzz | git hash-object --stdin -w)
>>
>> $ cat <(git ls-tree FETCH_HEAD) <(printf "100644 blob ${NEW_BLOB}\tzzz") | time git mktree
>> remote: Enumerating objects: 1, done.
>> remote: Counting objects: 100% (1/1), done.
>> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
>> Receiving objects: 100% (1/1), 334 bytes | 334.00 KiB/s, done.
>> remote: Enumerating objects: 1, done.
>> remote: Counting objects: 100% (1/1), done.
>> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
>> Receiving objects: 100% (1/1), 2.01 KiB | 2.01 MiB/s, done.
>> remote: Enumerating objects: 1, done.
>> remote: Counting objects: 100% (1/1), done.
>> remote: Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
>> Receiving objects: 100% (1/1), 256 bytes | 256.00 KiB/s, done.
>> # ...
>> # SOME TIME LATER
>> # ...
>> e26c7ce7357b1649da7b4200d4e80d0b668db8d4
>>       286.49 real        15.66 user        15.59 sys

I see. The problem here is that we are making _many requests_ for the same
tree, so maybe it would be better to introduce a batched download for the
list of missing objects. This would require iterating over the objects for
the tree to check existence (in quick mode) and adding the missing ones in
a list, then requesting that set altogether in a single request.

That probably won't be as fast as your modified mktree experiment below,
but would match my expectations of "probably faster assuming the repo is
big enough".

> Repeated experiment, but using modified mktree:
> 
>> $ cat <(git ls-tree FETCH_HEAD) <(printf "100644 blob ${NEW_BLOB}\tzzz") | time git mktree
>> e26c7ce7357b1649da7b4200d4e80d0b668db8d4
>>         0.06 real         0.01 user         0.03 sys
> 
> Did you have any other sort of performance test in mind? The remotes we
> typically deal with are geographically far away and deal with a high volume
> of traffic so we're keen to move behaviour to the client where it makes sense
> to do so.

I guess I wonder how large your promisor pack-files are in this test,
since your implementation depends on for_each_packed_object(), which
should be really inefficient if you're actually dealing with a large
partial clone.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-14 17:27     ` Derrick Stolee
@ 2022-06-15  0:35       ` Taylor Blau
  2022-06-15  4:00         ` Jeff King
  0 siblings, 1 reply; 20+ messages in thread
From: Taylor Blau @ 2022-06-15  0:35 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: Richard Oliver, git, jonathantanmy

On Tue, Jun 14, 2022 at 01:27:18PM -0400, Derrick Stolee wrote:
> > Did you have any other sort of performance test in mind? The remotes we
> > typically deal with are geographically far away and deal with a high volume
> > of traffic so we're keen to move behaviour to the client where it makes sense
> > to do so.
>
> I guess I wonder how large your promisor pack-files are in this test,
> since your implementation depends on for_each_packed_object(), which
> should be really inefficient if you're actually dealing with a large
> partial clone.

I had the same thought. Storing data available in the promisor packs
into an oid_map is going to be expensive if there are many such objects.

Is there a reason that we can't introduce a variant of
find_kept_pack_entry() that deals only with .promisor packs and look
these things up as-needed?

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-15  0:35       ` Taylor Blau
@ 2022-06-15  4:00         ` Jeff King
  2022-06-15 17:40           ` Richard Oliver
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff King @ 2022-06-15  4:00 UTC (permalink / raw)
  To: Taylor Blau; +Cc: Derrick Stolee, Richard Oliver, git, jonathantanmy

On Tue, Jun 14, 2022 at 08:35:16PM -0400, Taylor Blau wrote:

> On Tue, Jun 14, 2022 at 01:27:18PM -0400, Derrick Stolee wrote:
> > > Did you have any other sort of performance test in mind? The remotes we
> > > typically deal with are geographically far away and deal with a high volume
> > > of traffic so we're keen to move behaviour to the client where it makes sense
> > > to do so.
> >
> > I guess I wonder how large your promisor pack-files are in this test,
> > since your implementation depends on for_each_packed_object(), which
> > should be really inefficient if you're actually dealing with a large
> > partial clone.
> 
> I had the same thought. Storing data available in the promisor packs
> into an oid_map is going to be expensive if there are many such objects.
> 
> Is there a reason that we can't introduce a variant of
> find_kept_pack_entry() that deals only with .promisor packs and look
> these things up as-needed?

It's much worse than that. The promisor mechanism is fundamentally very
inefficient in runtime, optimizing instead for size. Imagine I have a
partial clone and I retrieve tree X, which points to a blob Y that I
don't get. I have X in a promisor pack, and asking about it is
efficient. But if I want to know about Y, I have no data structure
mentioning Y except the tree X itself. So to enumerate all of the
promisor edges, I have to walk all of the trees in the promisor pack.

So it is not just lookup, but actual tree walking that is expensive. The
flip side is that you don't have to store a complete separate list of
the promised objects. Whether that's a win depends on how many local
objects you have, versus how many are promised.

But it would be possible to cache the promisor list to make the tradeoff
separately. E.g., do the walk over the promisor trees once (perhaps at
pack creation time), and store a sorted list of fixed-length (oid, type)
records that could be binary searched. You could even put it in the
.promisor file. :)

-Peff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-15  4:00         ` Jeff King
@ 2022-06-15 17:40           ` Richard Oliver
  2022-06-15 18:17             ` Derrick Stolee
  2022-06-15 21:01             ` Junio C Hamano
  0 siblings, 2 replies; 20+ messages in thread
From: Richard Oliver @ 2022-06-15 17:40 UTC (permalink / raw)
  To: Jeff King, Taylor Blau; +Cc: Derrick Stolee, git, jonathantanmy

On 15/06/2022 05:00, Jeff King wrote:
> On Tue, Jun 14, 2022 at 08:35:16PM -0400, Taylor Blau wrote:
> 
>> On Tue, Jun 14, 2022 at 01:27:18PM -0400, Derrick Stolee wrote:
>>>> Did you have any other sort of performance test in mind? The remotes we
>>>> typically deal with are geographically far away and deal with a high volume
>>>> of traffic so we're keen to move behaviour to the client where it makes sense
>>>> to do so.
>>>
>>> I guess I wonder how large your promisor pack-files are in this test,
>>> since your implementation depends on for_each_packed_object(), which
>>> should be really inefficient if you're actually dealing with a large
>>> partial clone.
>>
>> I had the same thought. Storing data available in the promisor packs
>> into an oid_map is going to be expensive if there are many such objects.
>>
>> Is there a reason that we can't introduce a variant of
>> find_kept_pack_entry() that deals only with .promisor packs and look
>> these things up as-needed?
> 
> It's much worse than that. The promisor mechanism is fundamentally very
> inefficient in runtime, optimizing instead for size. Imagine I have a
> partial clone and I retrieve tree X, which points to a blob Y that I
> don't get. I have X in a promisor pack, and asking about it is
> efficient. But if I want to know about Y, I have no data structure
> mentioning Y except the tree X itself. So to enumerate all of the
> promisor edges, I have to walk all of the trees in the promisor pack.
> 
> So it is not just lookup, but actual tree walking that is expensive. The
> flip side is that you don't have to store a complete separate list of
> the promised objects. Whether that's a win depends on how many local
> objects you have, versus how many are promised.
> 
> But it would be possible to cache the promisor list to make the tradeoff
> separately. E.g., do the walk over the promisor trees once (perhaps at
> pack creation time), and store a sorted list of fixed-length (oid, type)
> records that could be binary searched. You could even put it in the
> .promisor file. :)
> 
> -Peff

I like the idea of caching the promisor list at pack creation time;
I'll start work on a patch set that implements this.

Meanwhile, is it worth considering a '--promised-as-missing' option
(or a config option) for invocations such as 'mktree --missing' that
prevents promised objects being faulted-in? Currently, the only
reliable way that I've found to prevent 'mktree --missing' faulting-in
promised objects is to remove the remote. Such an option could either
set the global variable 'fetch_if_missing' to '0' or could ensure
'OBJECT_INFO_SKIP_FETCH_OBJECT' is passed appropriately.

Cheers,
Richard

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-15 17:40           ` Richard Oliver
@ 2022-06-15 18:17             ` Derrick Stolee
  2022-06-16  6:07               ` Jeff King
  2022-06-15 21:01             ` Junio C Hamano
  1 sibling, 1 reply; 20+ messages in thread
From: Derrick Stolee @ 2022-06-15 18:17 UTC (permalink / raw)
  To: Richard Oliver, Jeff King, Taylor Blau; +Cc: git, jonathantanmy

On 6/15/2022 1:40 PM, Richard Oliver wrote:
> On 15/06/2022 05:00, Jeff King wrote:

>> So it is not just lookup, but actual tree walking that is expensive. The
>> flip side is that you don't have to store a complete separate list of
>> the promised objects. Whether that's a win depends on how many local
>> objects you have, versus how many are promised.

This is also why blobless (or blob-size filters) are the recommended way
to use partial clone. It's just too expensive to have tree misses.

>> But it would be possible to cache the promisor list to make the tradeoff
>> separately. E.g., do the walk over the promisor trees once (perhaps at
>> pack creation time), and store a sorted list of fixed-length (oid, type)
>> records that could be binary searched. You could even put it in the
>> .promisor file. :)
>>
>> -Peff
> 
> I like the idea of caching the promisor list at pack creation time;
> I'll start work on a patch set that implements this.
> 
> Meanwhile, is it worth considering a '--promised-as-missing' option
> (or a config option) for invocations such as 'mktree --missing' that
> prevents promised objects being faulted-in? Currently, the only
> reliable way that I've found to prevent 'mktree --missing' faulting-in
> promised objects is to remove the remote. Such an option could either
> set the global variable 'fetch_if_missing' to '0' or could ensure
> 'OBJECT_INFO_SKIP_FETCH_OBJECT' is passed appropriately.

One issue I've had with the current design of partial clone is that we
put all of the filter logic into the remotes, not the repository itself.
This means that if I use "git remote add" to add a new remote, then that
remote does not inherit the filter (and hence fetches can be too large
or even fail because we are "missing" an object pointed to by the
resulting pack).

If we had a repository-scoped notion of the filter, then we could
special-case this mktree logic to assume a blob if the filter only
excludes blobs. That would be even faster than looking up the type
from a "promised objects" file.

Just thinking about smaller steps that could simplify things before
adding a new data format. Feel free to continue pursuing that lookup
file if that's what you think will work best.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-15 17:40           ` Richard Oliver
  2022-06-15 18:17             ` Derrick Stolee
@ 2022-06-15 21:01             ` Junio C Hamano
  2022-06-16  5:02               ` Jeff King
  2022-06-16 15:46               ` [PATCH] mktree: Make '--missing' behave as documented Richard Oliver
  1 sibling, 2 replies; 20+ messages in thread
From: Junio C Hamano @ 2022-06-15 21:01 UTC (permalink / raw)
  To: Richard Oliver; +Cc: Jeff King, Taylor Blau, Derrick Stolee, git, jonathantanmy

Richard Oliver <roliver@roku.com> writes:

> Meanwhile, is it worth considering a '--promised-as-missing' option
> (or a config option) for invocations such as 'mktree --missing' that
> prevents promised objects being faulted-in? Currently, the only
> reliable way that I've found to prevent 'mktree --missing' faulting-in
> promised objects is to remove the remote. Such an option could either
> set the global variable 'fetch_if_missing' to '0' or could ensure
> 'OBJECT_INFO_SKIP_FETCH_OBJECT' is passed appropriately.

I didn't spend too much time on thinking this one through myself,
but do we really need a separte option?

As far as I remember, I wrote the original behaviour implemented in
1c64e79a (mktree --missing: allow missing objects, 2009-05-10) when
the command is run without --missing primarily to be extra paranoid
to detect broken repository with every chance we have to avoid
spreading existing breakage to new objects we create.  With the
input "mktree" gets from the end user, we have no need to learn
anything from existing objects in order to create the tree object at
all.

The original 1c64e79a even carefully made sure that with --missing
in effect, we do not ask the object store about an object:

	/* It is perfectly normal if we do not have a commit from a submodule */
	if (S_ISGITLINK(mode))
		allow_missing = 1;

	if (!allow_missing)
		type = sha1_object_info(sha1, NULL);
	else
		type = object_type(mode);

	if (type < 0)
		die("object %s unavailable", sha1_to_hex(sha1));


We by grave mistake at 31c8221a (mktree: validate entry type in
input, 2009-05-14) started insisting on inspecting objects even when
allow-mising was given.  I do not think it was sensible, given why
we had "--missing" as an option to allow users to say "you do not
have to be too paranoid".

The codebase is so distant but I think we should probably do a moral
revert/reconstruct of that commit so that the extra paranoia of the
said commit applies only when "--missing" is not in effect, or
something like that.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-15 21:01             ` Junio C Hamano
@ 2022-06-16  5:02               ` Jeff King
  2022-06-16 15:46               ` [PATCH] mktree: Make '--missing' behave as documented Richard Oliver
  1 sibling, 0 replies; 20+ messages in thread
From: Jeff King @ 2022-06-16  5:02 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Richard Oliver, Taylor Blau, Derrick Stolee, git, jonathantanmy

On Wed, Jun 15, 2022 at 02:01:33PM -0700, Junio C Hamano wrote:

> Richard Oliver <roliver@roku.com> writes:
> 
> > Meanwhile, is it worth considering a '--promised-as-missing' option
> > (or a config option) for invocations such as 'mktree --missing' that
> > prevents promised objects being faulted-in? Currently, the only
> > reliable way that I've found to prevent 'mktree --missing' faulting-in
> > promised objects is to remove the remote. Such an option could either
> > set the global variable 'fetch_if_missing' to '0' or could ensure
> > 'OBJECT_INFO_SKIP_FETCH_OBJECT' is passed appropriately.
> 
> I didn't spend too much time on thinking this one through myself,
> but do we really need a separte option?
> [...]
> We by grave mistake at 31c8221a (mktree: validate entry type in
> input, 2009-05-14) started insisting on inspecting objects even when
> allow-mising was given.  I do not think it was sensible, given why
> we had "--missing" as an option to allow users to say "you do not
> have to be too paranoid".
> 
> The codebase is so distant but I think we should probably do a moral
> revert/reconstruct of that commit so that the extra paranoia of the
> said commit applies only when "--missing" is not in effect, or
> something like that.

FWIW, I had the same reaction. I think fixing "--missing" should be the
first step, and would unstick Richard's use case, as I understand it.

There is some value to improving the promisor case, since using
"--missing" is a pretty broad stroke (i.e., you'd fail to actual
corruption of missing non-promisor objects). That could either be
checking the promisor object and type without faulting it in, or just
skipping the type-check for objects after confirming that they're
promisors.

But that can come on top, I think. The use case there is also a bit
narrower.  The local repository does not know about all promised
objects. It can only see the boundaries of objects it doesn't have (so
with --filter=tree:0, for example, a partial clone of a repo with path
"a/b/c" would know about "b" but not "c"). So in the most general case
you'd still have to resort to "--missing", but I suspect in practice
you'd always feed things at that boundary to mktree (otherwise, how
would you even know the oid of "c").

-Peff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-15 18:17             ` Derrick Stolee
@ 2022-06-16  6:07               ` Jeff King
  2022-06-16  6:54                 ` [PATCH] is_promisor_object(): walk promisor packs in pack-order Jeff King
  2022-06-16 13:59                 ` [PATCH] mktree: learn about promised objects Derrick Stolee
  0 siblings, 2 replies; 20+ messages in thread
From: Jeff King @ 2022-06-16  6:07 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: Richard Oliver, Taylor Blau, git, jonathantanmy

On Wed, Jun 15, 2022 at 02:17:58PM -0400, Derrick Stolee wrote:

> On 6/15/2022 1:40 PM, Richard Oliver wrote:
> > On 15/06/2022 05:00, Jeff King wrote:
> 
> >> So it is not just lookup, but actual tree walking that is expensive. The
> >> flip side is that you don't have to store a complete separate list of
> >> the promised objects. Whether that's a win depends on how many local
> >> objects you have, versus how many are promised.
> 
> This is also why blobless (or blob-size filters) are the recommended way
> to use partial clone. It's just too expensive to have tree misses.

I agree that tree misses are awful, but I'm actually talking about
something different: traversing the local trees we _do_ have in order to
find the set of promised objects. Which is worse for blob:none, because
it means you have more trees locally. :)

Try this with a big repo like linux.git:

  git clone --no-local --filter=blob:none linux.git repo
  cd repo

  # this is fast; we mark the promisor trees as UNINTERESTING, so we do
  # not look at them as part of the traversal, and never call
  # is_promisor_object().
  time git rev-list --count --objects --all --exclude-promisor-objects

  # but imagine we had a fixed mktree[1] that did not fault in the blobs
  # unnecessarily, and we made a new tree that references a promised
  # blob.
  tree=$(git ls-tree HEAD~1000 | grep Makefile | git mktree --missing)
  commit=$(echo foo | git commit-tree -p HEAD $tree)
  git update-ref refs/heads/foo $commit

  # this is now slow; even though we only call is_promisor_object()
  # once, we have to open every single tree in the pack to find it!
  time git rev-list --count --objects --all --exclude-promisor-objects

Those rev-lists run in 1.7s and 224s respectively. Ouch!

Now the setup there is kind of contrived, and it's actually not trivial
to convince rev-list to actually call is_promisor_object() these days.
That's because promisor-ness (promisosity?) tends to be one-way
transitive. A promised blob is usually either only referenced by
promised trees (which we have in this case), or is faulted in as part of
making a new tree.

But it's not guaranteed, and certainly a faulted-in object could later
be deleted from the local repo, since it's promised. I suspect there are
less convoluted workflows to get to a similar state, but I don't know
them offhand. There's more discussion of this issue in this thread from
last summer:

  https://lore.kernel.org/git/20210403090412.GH2271@szeder.dev/

-Peff

[1] The mktree I used was the fix discussed elsewhere in the thread:

diff --git a/builtin/mktree.c b/builtin/mktree.c
index 902edba6d2..42ae82230c 100644
--- a/builtin/mktree.c
+++ b/builtin/mktree.c
@@ -117,15 +117,11 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
 	}
 
 	/* Check the type of object identified by sha1 */
-	obj_type = oid_object_info(the_repository, &oid, NULL);
-	if (obj_type < 0) {
-		if (allow_missing) {
-			; /* no problem - missing objects are presumed to be of the right type */
-		} else {
+	if (!allow_missing) {
+		obj_type = oid_object_info(the_repository, &oid, NULL);
+		if (obj_type < 0)
 			die("entry '%s' object %s is unavailable", path, oid_to_hex(&oid));
-		}
-	} else {
-		if (obj_type != mode_type) {
+		else if (obj_type != mode_type) {
 			/*
 			 * The object exists but is of the wrong type.
 			 * This is a problem regardless of allow_missing

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH] is_promisor_object(): walk promisor packs in pack-order
  2022-06-16  6:07               ` Jeff King
@ 2022-06-16  6:54                 ` Jeff King
  2022-06-16 14:00                   ` Derrick Stolee
  2022-06-17 19:50                   ` Jonathan Tan
  2022-06-16 13:59                 ` [PATCH] mktree: learn about promised objects Derrick Stolee
  1 sibling, 2 replies; 20+ messages in thread
From: Jeff King @ 2022-06-16  6:54 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: Richard Oliver, Taylor Blau, git, jonathantanmy

On Thu, Jun 16, 2022 at 02:07:41AM -0400, Jeff King wrote:

> Those rev-lists run in 1.7s and 224s respectively. Ouch!

Even though I expected the second one to be slow, I was shocked at just
how slow it was. The patch below speeds it up by a factor of 2, and I
think is worth applying separately, regardless of anything else being
discussed in this thread.

-- >8 --
Subject: is_promisor_object(): walk promisor packs in pack-order

When we generate the list of promisor objects, we walk every pack with a
.promisor file and examine its objects for any links to other objects.
By default, for_each_packed_object() will go in pack .idx order.

This is the worst case with respect to our delta base cache. If we have
a delta chain of A->B->C->D, then visiting A may require reconstructing
both B and C, unless we also visited B recently, in which case we may
have cached its value. Because .idx order is based on sha1, it's random
with respect to the actual object contents and deltas, and thus we're
unlikely to get many cache hits.

If we instead traverse in pack order, then we get the optimal case:
packs are written to keep delta families together, and to place bases
before their children.

Even on a modest repository like git.git, this has a noticeable speedup
on p5600.4, which runs "fsck" on a partial clone with blob:none (so lots
of trees which need to be walked, and which delta well):

Test       HEAD^               HEAD
-------------------------------------------------------
5600.4:    17.87(17.83+0.04)   15.42(15.35+0.06) -13.7%

On a larger repository like linux.git, the speedup is even more
pronounced:

Test       HEAD^                 HEAD
-----------------------------------------------------------
5600.4:    322.47(322.01+0.42)   186.41(185.76+0.63) -42.2%

Any other operations that call is_promisor_object(), like "rev-list
--exclude-promisor-objects", would similarly benefit, but the
invocations in p5600 don't actually trigger any such cases.

Note that we may pay a small price to build a rev-index in-memory to do
the pack-order traversal. But it's still a big net win, and even that
small cost goes away if you are using pack.writeReverseIndex.

Signed-off-by: Jeff King <peff@peff.net>
---
As an aside, feels like pack.writeReverseIndex ought to become the
default (which AFAIK hasn't happened yet).

 packfile.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packfile.c b/packfile.c
index 8e812a84a3..ed69fe457b 100644
--- a/packfile.c
+++ b/packfile.c
@@ -2275,7 +2275,8 @@ int is_promisor_object(const struct object_id *oid)
 		if (has_promisor_remote()) {
 			for_each_packed_object(add_promisor_object,
 					       &promisor_objects,
-					       FOR_EACH_OBJECT_PROMISOR_ONLY);
+					       FOR_EACH_OBJECT_PROMISOR_ONLY |
+					       FOR_EACH_OBJECT_PACK_ORDER);
 		}
 		promisor_objects_prepared = 1;
 	}
-- 
2.37.0.rc0.352.g10876ef154


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: learn about promised objects
  2022-06-16  6:07               ` Jeff King
  2022-06-16  6:54                 ` [PATCH] is_promisor_object(): walk promisor packs in pack-order Jeff King
@ 2022-06-16 13:59                 ` Derrick Stolee
  1 sibling, 0 replies; 20+ messages in thread
From: Derrick Stolee @ 2022-06-16 13:59 UTC (permalink / raw)
  To: Jeff King; +Cc: Richard Oliver, Taylor Blau, git, jonathantanmy

On 6/16/2022 2:07 AM, Jeff King wrote:
> On Wed, Jun 15, 2022 at 02:17:58PM -0400, Derrick Stolee wrote:
> 
>> On 6/15/2022 1:40 PM, Richard Oliver wrote:
>>> On 15/06/2022 05:00, Jeff King wrote:
>>
>>>> So it is not just lookup, but actual tree walking that is expensive. The
>>>> flip side is that you don't have to store a complete separate list of
>>>> the promised objects. Whether that's a win depends on how many local
>>>> objects you have, versus how many are promised.
>>
>> This is also why blobless (or blob-size filters) are the recommended way
>> to use partial clone. It's just too expensive to have tree misses.
> 
> I agree that tree misses are awful, but I'm actually talking about
> something different: traversing the local trees we _do_ have in order to
> find the set of promised objects. Which is worse for blob:none, because
> it means you have more trees locally. :)

Ah, I misread your email. I agree that walking trees is far too
expensive to do just to find an object type.

> Try this with a big repo like linux.git:
> 
>   git clone --no-local --filter=blob:none linux.git repo
>   cd repo
> 
>   # this is fast; we mark the promisor trees as UNINTERESTING, so we do
>   # not look at them as part of the traversal, and never call
>   # is_promisor_object().
>   time git rev-list --count --objects --all --exclude-promisor-objects
> 
>   # but imagine we had a fixed mktree[1] that did not fault in the blobs
>   # unnecessarily, and we made a new tree that references a promised
>   # blob.
>   tree=$(git ls-tree HEAD~1000 | grep Makefile | git mktree --missing)
>   commit=$(echo foo | git commit-tree -p HEAD $tree)
>   git update-ref refs/heads/foo $commit
> 
>   # this is now slow; even though we only call is_promisor_object()
>   # once, we have to open every single tree in the pack to find it!
>   time git rev-list --count --objects --all --exclude-promisor-objects
> 
> Those rev-lists run in 1.7s and 224s respectively. Ouch!

This is exactly the reason I thought just asking for the objects
directly is faster than scanning all the packs. Thanks for giving
concrete numbers that support that assumption.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] is_promisor_object(): walk promisor packs in pack-order
  2022-06-16  6:54                 ` [PATCH] is_promisor_object(): walk promisor packs in pack-order Jeff King
@ 2022-06-16 14:00                   ` Derrick Stolee
  2022-06-17 19:50                   ` Jonathan Tan
  1 sibling, 0 replies; 20+ messages in thread
From: Derrick Stolee @ 2022-06-16 14:00 UTC (permalink / raw)
  To: Jeff King; +Cc: Richard Oliver, Taylor Blau, git, jonathantanmy

On 6/16/2022 2:54 AM, Jeff King wrote:
> On Thu, Jun 16, 2022 at 02:07:41AM -0400, Jeff King wrote:
> 
>> Those rev-lists run in 1.7s and 224s respectively. Ouch!
> 
> Even though I expected the second one to be slow, I was shocked at just
> how slow it was. The patch below speeds it up by a factor of 2, and I
> think is worth applying separately, regardless of anything else being
> discussed in this thread.

I agree. It's extremely simple for the value it provides.

> diff --git a/packfile.c b/packfile.c
> index 8e812a84a3..ed69fe457b 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -2275,7 +2275,8 @@ int is_promisor_object(const struct object_id *oid)
>  		if (has_promisor_remote()) {
>  			for_each_packed_object(add_promisor_object,
>  					       &promisor_objects,
> -					       FOR_EACH_OBJECT_PROMISOR_ONLY);
> +					       FOR_EACH_OBJECT_PROMISOR_ONLY |
> +					       FOR_EACH_OBJECT_PACK_ORDER);
>  		}

Thanks,
-Stolee


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH] mktree: Make '--missing' behave as documented
  2022-06-15 21:01             ` Junio C Hamano
  2022-06-16  5:02               ` Jeff King
@ 2022-06-16 15:46               ` Richard Oliver
  2022-06-16 17:44                 ` Junio C Hamano
  1 sibling, 1 reply; 20+ messages in thread
From: Richard Oliver @ 2022-06-16 15:46 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Jeff King, Taylor Blau, Derrick Stolee, git, jonathantanmy

On 15/06/2022 22:01, Junio C Hamano wrote:
> We by grave mistake at 31c8221a (mktree: validate entry type in
> input, 2009-05-14) started insisting on inspecting objects even when
> allow-mising was given.  I do not think it was sensible, given why
> we had "--missing" as an option to allow users to say "you do not
> have to be too paranoid".
> 
> The codebase is so distant but I think we should probably do a moral
> revert/reconstruct of that commit so that the extra paranoia of the
> said commit applies only when "--missing" is not in effect, or
> something like that.

-- >8 --
Subject: [PATCH] mktree: Make '--missing' behave as documented

Do not make use of the object store to verify object type for
'allow_missing' objects in mktree_line(). GIT-MKTREE(1) documents
'--missing' as disabling "verif[ication] that each tree entry's sha1
identifies an existing object".

This change retains the check for '<mode>' and '<type>' for
'allow_missing' objects as this check is merely input validation that
avoids interrogating the object store.

Signed-off-by: Richard Oliver <roliver@roku.com>
---
 builtin/mktree.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/builtin/mktree.c b/builtin/mktree.c
index 902edba6d2..f41fda6e7d 100644
--- a/builtin/mktree.c
+++ b/builtin/mktree.c
@@ -116,15 +116,12 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
 			path, ptr, type_name(mode_type));
 	}
 
-	/* Check the type of object identified by sha1 */
-	obj_type = oid_object_info(the_repository, &oid, NULL);
-	if (obj_type < 0) {
-		if (allow_missing) {
-			; /* no problem - missing objects are presumed to be of the right type */
-		} else {
+	if (!allow_missing) {
+		/* Check the type of object identified by sha1 */
+		obj_type = oid_object_info(the_repository, &oid, NULL);
+		if (obj_type < 0) {
 			die("entry '%s' object %s is unavailable", path, oid_to_hex(&oid));
 		}
-	} else {
 		if (obj_type != mode_type) {
 			/*
 			 * The object exists but is of the wrong type.
-- 
2.36.1.467.g4f6db706e6.dirty


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: Make '--missing' behave as documented
  2022-06-16 15:46               ` [PATCH] mktree: Make '--missing' behave as documented Richard Oliver
@ 2022-06-16 17:44                 ` Junio C Hamano
  2022-06-21 13:59                   ` [PATCH] mktree: do not check type of remote objects Richard Oliver
  0 siblings, 1 reply; 20+ messages in thread
From: Junio C Hamano @ 2022-06-16 17:44 UTC (permalink / raw)
  To: Richard Oliver; +Cc: Jeff King, Taylor Blau, Derrick Stolee, git, jonathantanmy

Richard Oliver <roliver@roku.com> writes:

> Subject: [PATCH] mktree: Make '--missing' behave as documented

I would intead title the change as

	   mktree: disable object verification under '--missing'

if I were writing this change.  It is a tad longer, but "as
documented" is much less informative when this appears in "git
shortlog --no-merges".

But wait for a bit.  I am not sold on "disable" 100% yet.

> Do not make use of the object store to verify object type for
> 'allow_missing' objects in mktree_line(). GIT-MKTREE(1) documents
> '--missing' as disabling "verif[ication] that each tree entry's sha1
> identifies an existing object".

I am on the fence on this.  The current behaviour is not exactly
forcing verification with "--missing".  Even if oid_object_info(),
after consulting the promisor remote, says that the object is not
available, the code does not complain, and in that sense, it is
working as documented.  It's just that it is doing unnecessary work
that you know you are not going to use.

> This change retains the check for '<mode>' and '<type>' for
> 'allow_missing' objects as this check is merely input validation that
> avoids interrogating the object store.
>
> Signed-off-by: Richard Oliver <roliver@roku.com>
> ---
>  builtin/mktree.c | 11 ++++-------
>  1 file changed, 4 insertions(+), 7 deletions(-)

31c8221a (mktree: validate entry type in input, 2009-05-14) was
overly eager to check with the local object store. They called the
sha1_object_info() API to obtain the type information, allowed the
call to silently fail when the object was missing locally, but
sanity checked the types opportunistically when the object did
exist.  That implementation is hurting us now, but it is sort of
excusable because back then there was no lazy/on-demand downloading
of individual objects that causes a long delay and materializes the
object, hence defeating the point of using "--missing".

And that is why the code back then was OK but not anymore in today's
world.  Something like the analysis in the above paragraph should be
in the commit log message to explain why we are making this change
today.

This patch would be a good first cut as a starting point, but we
probably can do better by doing oid_object_info_extended() call with
OBJECT_INFO_SKIP_FETCH_OBJECT bit (and probably QUICK bit, too) set,
with the current code structure.

And when we do so, the title would not match the purpose of the
change.  The verification was disabled with "--missing" all along
and that is not what we are changing.  What we will be fixing is the
wasteful implementation.

    mktree: do not check types of remote objects

    With 31c8221a (mktree: validate entry type in input, 2009-05-14), we
    called the sha1_object_info() API to obtain the type information,
    but allowed the call to silently fail when the object was missing
    locally, so that we can sanity-check the types opportunistically
    when the object did exist.

    The implementation is understandable because back then there was no
    lazy/on-demand downloading of individual objects from the promisor
    remotes that causes a long delay and materializes the object, hence
    defeating the point of using "--missing".  The design is hurting us
    now.

    We could bypass the opportunistic type/mode consistency check
    altogether when "--missing" is given, but instead, use the
    oid_object_info_extended() API and tell it that we are only
    interested in objects that locally exist and are immediately
    available by passing OBJECT_INFO_SKIP_FETCH_OBJECT bit to it.  That
    way, we will still retain the cheap and opportunistic sanity check
    for local objects.

or something like that, perhaps?

Thanks.

> diff --git a/builtin/mktree.c b/builtin/mktree.c
> index 902edba6d2..f41fda6e7d 100644
> --- a/builtin/mktree.c
> +++ b/builtin/mktree.c
> @@ -116,15 +116,12 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
>  			path, ptr, type_name(mode_type));
>  	}
>  
> -	/* Check the type of object identified by sha1 */
> -	obj_type = oid_object_info(the_repository, &oid, NULL);
> -	if (obj_type < 0) {
> -		if (allow_missing) {
> -			; /* no problem - missing objects are presumed to be of the right type */
> -		} else {
> +	if (!allow_missing) {
> +		/* Check the type of object identified by sha1 */
> +		obj_type = oid_object_info(the_repository, &oid, NULL);
> +		if (obj_type < 0) {
>  			die("entry '%s' object %s is unavailable", path, oid_to_hex(&oid));
>  		}
> -	} else {
>  		if (obj_type != mode_type) {
>  			/*
>  			 * The object exists but is of the wrong type.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] is_promisor_object(): walk promisor packs in pack-order
  2022-06-16  6:54                 ` [PATCH] is_promisor_object(): walk promisor packs in pack-order Jeff King
  2022-06-16 14:00                   ` Derrick Stolee
@ 2022-06-17 19:50                   ` Jonathan Tan
  1 sibling, 0 replies; 20+ messages in thread
From: Jonathan Tan @ 2022-06-17 19:50 UTC (permalink / raw)
  To: Jeff King; +Cc: Jonathan Tan, Derrick Stolee, Richard Oliver, Taylor Blau, git

Jeff King <peff@peff.net> writes:
> On Thu, Jun 16, 2022 at 02:07:41AM -0400, Jeff King wrote:
> 
> > Those rev-lists run in 1.7s and 224s respectively. Ouch!
> 
> Even though I expected the second one to be slow, I was shocked at just
> how slow it was. The patch below speeds it up by a factor of 2, and I
> think is worth applying separately, regardless of anything else being
> discussed in this thread.

Thanks for the patch and the benchmark. This makes sense to me.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH] mktree: do not check type of remote objects
  2022-06-16 17:44                 ` Junio C Hamano
@ 2022-06-21 13:59                   ` Richard Oliver
  2022-06-21 16:51                     ` Junio C Hamano
  2022-06-21 17:48                     ` Junio C Hamano
  0 siblings, 2 replies; 20+ messages in thread
From: Richard Oliver @ 2022-06-21 13:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Jeff King, Taylor Blau, Derrick Stolee, git, jonathantanmy

On 16/06/2022 18:44, Junio C Hamano wrote:
> This patch would be a good first cut as a starting point, but we
> probably can do better by doing oid_object_info_extended() call with
> OBJECT_INFO_SKIP_FETCH_OBJECT bit (and probably QUICK bit, too) set,
> with the current code structure.
> 
> And when we do so, the title would not match the purpose of the
> change.  The verification was disabled with "--missing" all along
> and that is not what we are changing.  What we will be fixing is the
> wasteful implementation.
> 
>     mktree: do not check types of remote objects
> 
>     With 31c8221a (mktree: validate entry type in input, 2009-05-14), we
>     called the sha1_object_info() API to obtain the type information,
>     but allowed the call to silently fail when the object was missing
>     locally, so that we can sanity-check the types opportunistically
>     when the object did exist.
> 
>     The implementation is understandable because back then there was no
>     lazy/on-demand downloading of individual objects from the promisor
>     remotes that causes a long delay and materializes the object, hence
>     defeating the point of using "--missing".  The design is hurting us
>     now.
> 
>     We could bypass the opportunistic type/mode consistency check
>     altogether when "--missing" is given, but instead, use the
>     oid_object_info_extended() API and tell it that we are only
>     interested in objects that locally exist and are immediately
>     available by passing OBJECT_INFO_SKIP_FETCH_OBJECT bit to it.  That
>     way, we will still retain the cheap and opportunistic sanity check
>     for local objects.

I've prepared a patch below as per your suggestion.

As a side note, do you think we need to re-work some uses of the word
'missing' in the documentation? Some uses of the word, such as in
mktree, predate the concept of promisor remotes. The partial-clone.txt
documentation differentiates between missing "due to a partial clone
or fetch" and missing "due to repository corruption".  Would making
such a distinction elsewhere be useful?

Cheers,
Richard

-- >8 --
Subject: [PATCH] mktree: do not check type of remote objects

With 31c8221a (mktree: validate entry type in input, 2009-05-14), we
called the sha1_object_info() API to obtain the type information, but
allowed the call to silently fail when the object was missing locally,
so that we can sanity-check the types opportunistically when the
object did exist.

The implementation is understandable because back then there was no
lazy/on-demand downloading of individual objects from the promisor
remotes that causes a long delay and materializes the object, hence
defeating the point of using "--missing".  The design is hurting us
now.

We could bypass the opportunistic type/mode consistency check
altogether when "--missing" is given, but instead, use the
oid_object_info_extended() API and tell it that we are only interested
in objects that locally exist and are immediately available by passing
OBJECT_INFO_SKIP_FETCH_OBJECT bit to it.  That way, we will still
retain the cheap and opportunistic sanity check for local objects.

Signed-off-by: Richard Oliver <roliver@roku.com>
---
 builtin/mktree.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/builtin/mktree.c b/builtin/mktree.c
index 902edba6d2..cfadb52670 100644
--- a/builtin/mktree.c
+++ b/builtin/mktree.c
@@ -116,8 +116,15 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
 			path, ptr, type_name(mode_type));
 	}
 
-	/* Check the type of object identified by sha1 */
-	obj_type = oid_object_info(the_repository, &oid, NULL);
+	/* Check the type of object identified by oid without fetching objects */
+	struct object_info oi = OBJECT_INFO_INIT;
+	oi.typep = &obj_type;
+	if (oid_object_info_extended(the_repository, &oid, &oi,
+				     OBJECT_INFO_LOOKUP_REPLACE |
+				     OBJECT_INFO_QUICK |
+				     OBJECT_INFO_SKIP_FETCH_OBJECT) < 0)
+		obj_type = -1;
+
 	if (obj_type < 0) {
 		if (allow_missing) {
 			; /* no problem - missing objects are presumed to be of the right type */
-- 
2.36.1.467.g4f6db706e6.dirty


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: do not check type of remote objects
  2022-06-21 13:59                   ` [PATCH] mktree: do not check type of remote objects Richard Oliver
@ 2022-06-21 16:51                     ` Junio C Hamano
  2022-06-21 17:48                     ` Junio C Hamano
  1 sibling, 0 replies; 20+ messages in thread
From: Junio C Hamano @ 2022-06-21 16:51 UTC (permalink / raw)
  To: Richard Oliver; +Cc: Jeff King, Taylor Blau, Derrick Stolee, git, jonathantanmy

Richard Oliver <roliver@roku.com> writes:

> As a side note, do you think we need to re-work some uses of the word
> 'missing' in the documentation? Some uses of the word, such as in
> mktree, predate the concept of promisor remotes. The partial-clone.txt
> documentation differentiates between missing "due to a partial clone
> or fetch" and missing "due to repository corruption".  Would making
> such a distinction elsewhere be useful?

I do agree with that direction.  Back when the world was simpler,
there were just "we have the object" and "we ought to have the
object but we do not see it", the latter of which was a clear
repository corruption.  There was no third choice.

With the lazy/partial clone stuff, it is not a corrupted repository
anymore when certain objects that are reachable during traversal is
missing (this is slightly different from a shallow clone in that a
shallow history makes the traversal stop early, so at least objects
in a shallow clone that are reachable during traversal must exist).

So, yes, I think the distinction between "missing but shouldn't be"
and "missing by design" is a good thing to keep in mind.

Thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] mktree: do not check type of remote objects
  2022-06-21 13:59                   ` [PATCH] mktree: do not check type of remote objects Richard Oliver
  2022-06-21 16:51                     ` Junio C Hamano
@ 2022-06-21 17:48                     ` Junio C Hamano
  1 sibling, 0 replies; 20+ messages in thread
From: Junio C Hamano @ 2022-06-21 17:48 UTC (permalink / raw)
  To: Richard Oliver; +Cc: Jeff King, Taylor Blau, Derrick Stolee, git, jonathantanmy

Richard Oliver <roliver@roku.com> writes:

> +	/* Check the type of object identified by oid without fetching objects */
> +	struct object_info oi = OBJECT_INFO_INIT;

This is -Wdecl-after-statement.

> +	oi.typep = &obj_type;
> +	if (oid_object_info_extended(the_repository, &oid, &oi,
> +				     OBJECT_INFO_LOOKUP_REPLACE |
> +				     OBJECT_INFO_QUICK |
> +				     OBJECT_INFO_SKIP_FETCH_OBJECT) < 0)
> +		obj_type = -1;
> +
>  	if (obj_type < 0) {
>  		if (allow_missing) {
>  			; /* no problem - missing objects are presumed to be of the right type */


I've done an obvious and trivial fix-up locally while queueing.

Thanks.


diff --git a/builtin/mktree.c b/builtin/mktree.c
index cfadb52670..06d81400f5 100644
--- a/builtin/mktree.c
+++ b/builtin/mktree.c
@@ -74,6 +74,7 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
 	unsigned mode;
 	enum object_type mode_type; /* object type derived from mode */
 	enum object_type obj_type; /* object type derived from sha */
+	struct object_info oi = OBJECT_INFO_INIT;
 	char *path, *to_free = NULL;
 	struct object_id oid;
 
@@ -117,7 +118,6 @@ static void mktree_line(char *buf, int nul_term_line, int allow_missing)
 	}
 
 	/* Check the type of object identified by oid without fetching objects */
-	struct object_info oi = OBJECT_INFO_INIT;
 	oi.typep = &obj_type;
 	if (oid_object_info_extended(the_repository, &oid, &oi,
 				     OBJECT_INFO_LOOKUP_REPLACE |

^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-06-21 17:48 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-14 13:36 [PATCH] mktree: learn about promised objects Richard Oliver
2022-06-14 14:14 ` Derrick Stolee
2022-06-14 16:33   ` Richard Oliver
2022-06-14 17:27     ` Derrick Stolee
2022-06-15  0:35       ` Taylor Blau
2022-06-15  4:00         ` Jeff King
2022-06-15 17:40           ` Richard Oliver
2022-06-15 18:17             ` Derrick Stolee
2022-06-16  6:07               ` Jeff King
2022-06-16  6:54                 ` [PATCH] is_promisor_object(): walk promisor packs in pack-order Jeff King
2022-06-16 14:00                   ` Derrick Stolee
2022-06-17 19:50                   ` Jonathan Tan
2022-06-16 13:59                 ` [PATCH] mktree: learn about promised objects Derrick Stolee
2022-06-15 21:01             ` Junio C Hamano
2022-06-16  5:02               ` Jeff King
2022-06-16 15:46               ` [PATCH] mktree: Make '--missing' behave as documented Richard Oliver
2022-06-16 17:44                 ` Junio C Hamano
2022-06-21 13:59                   ` [PATCH] mktree: do not check type of remote objects Richard Oliver
2022-06-21 16:51                     ` Junio C Hamano
2022-06-21 17:48                     ` Junio C Hamano

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.