All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] packfile: use oidset for bad objects
@ 2021-09-11  7:50 René Scharfe
  2021-09-11  7:59 ` [PATCH 1/3] packfile: convert mark_bad_packed_object() to object_id René Scharfe
                   ` (5 more replies)
  0 siblings, 6 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11  7:50 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano

Replace the custom hash array for remembering corrupt pack entries with
an oidset.  This shortens and simplifies the code.

  packfile: convert mark_bad_packed_object() to object_id
  packfile: convert has_packed_and_bad() to object_id
  packfile: use oidset for bad objects

 midx.c         | 13 ++++---------
 object-file.c  |  4 ++--
 object-store.h |  4 ++--
 packfile.c     | 37 ++++++++++---------------------------
 packfile.h     |  4 ++--
 5 files changed, 20 insertions(+), 42 deletions(-)

--
2.33.0

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/3] packfile: convert mark_bad_packed_object() to object_id
  2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
@ 2021-09-11  7:59 ` René Scharfe
  2021-09-11  8:00 ` [PATCH 2/3] packfile: convert has_packed_and_bad() " René Scharfe
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11  7:59 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano

All callers have full object IDs, so pass them on instead of just their
hash member.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 object-file.c |  2 +-
 packfile.c    | 12 ++++++------
 packfile.h    |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/object-file.c b/object-file.c
index a8be899481..fb5a385a06 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1616,7 +1616,7 @@ static int do_oid_object_info_extended(struct repository *r,
 		return 0;
 	rtype = packed_object_info(r, e.p, e.offset, oi);
 	if (rtype < 0) {
-		mark_bad_packed_object(e.p, real->hash);
+		mark_bad_packed_object(e.p, real);
 		return do_oid_object_info_extended(r, real, oi, 0);
 	} else if (oi->whence == OI_PACKED) {
 		oi->u.packed.offset = e.offset;
diff --git a/packfile.c b/packfile.c
index 4d0d625238..fb15fc5b49 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1161,17 +1161,17 @@ int unpack_object_header(struct packed_git *p,
 	return type;
 }

-void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1)
+void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 {
 	unsigned i;
 	const unsigned hashsz = the_hash_algo->rawsz;
 	for (i = 0; i < p->num_bad_objects; i++)
-		if (hasheq(sha1, p->bad_object_sha1 + hashsz * i))
+		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
 			return;
 	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
 				      st_mult(GIT_MAX_RAWSZ,
 					      st_add(p->num_bad_objects, 1)));
-	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, sha1);
+	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
 	p->num_bad_objects++;
 }

@@ -1272,7 +1272,7 @@ static int retry_bad_packed_offset(struct repository *r,
 	if (offset_to_pack_pos(p, obj_offset, &pos) < 0)
 		return OBJ_BAD;
 	nth_packed_object_id(&oid, p, pack_pos_to_index(p, pos));
-	mark_bad_packed_object(p, oid.hash);
+	mark_bad_packed_object(p, &oid);
 	type = oid_object_info(r, &oid, NULL);
 	if (type <= OBJ_NONE)
 		return OBJ_BAD;
@@ -1722,7 +1722,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
 				nth_packed_object_id(&oid, p, index_pos);
 				error("bad packed object CRC for %s",
 				      oid_to_hex(&oid));
-				mark_bad_packed_object(p, oid.hash);
+				mark_bad_packed_object(p, &oid);
 				data = NULL;
 				goto out;
 			}
@@ -1811,7 +1811,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
 				      " at offset %"PRIuMAX" from %s",
 				      oid_to_hex(&base_oid), (uintmax_t)obj_offset,
 				      p->pack_name);
-				mark_bad_packed_object(p, base_oid.hash);
+				mark_bad_packed_object(p, &base_oid);
 				base = read_object(r, &base_oid, &type, &base_size);
 				external_base = base;
 			}
diff --git a/packfile.h b/packfile.h
index 3ae117a8ae..a982ed9994 100644
--- a/packfile.h
+++ b/packfile.h
@@ -159,7 +159,7 @@ int packed_object_info(struct repository *r,
 		       struct packed_git *pack,
 		       off_t offset, struct object_info *);

-void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1);
+void mark_bad_packed_object(struct packed_git *, const struct object_id *);
 const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1);

 #define ON_DISK_KEEP_PACKS 1
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/3] packfile: convert has_packed_and_bad() to object_id
  2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
  2021-09-11  7:59 ` [PATCH 1/3] packfile: convert mark_bad_packed_object() to object_id René Scharfe
@ 2021-09-11  8:00 ` René Scharfe
  2021-09-11  8:01 ` [PATCH 3/3] packfile: use oidset for bad objects René Scharfe
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11  8:00 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano

The single caller has a full object ID, so pass it on instead of just
its hash member.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 object-file.c | 2 +-
 packfile.c    | 4 ++--
 packfile.h    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index fb5a385a06..01e7058b4e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1725,7 +1725,7 @@ void *read_object_file_extended(struct repository *r,
 		die(_("loose object %s (stored in %s) is corrupt"),
 		    oid_to_hex(repl), path);

-	if ((p = has_packed_and_bad(r, repl->hash)) != NULL)
+	if ((p = has_packed_and_bad(r, repl)) != NULL)
 		die(_("packed object %s (stored in %s) is corrupt"),
 		    oid_to_hex(repl), p->pack_name);
 	obj_read_unlock();
diff --git a/packfile.c b/packfile.c
index fb15fc5b49..04080a558b 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1176,14 +1176,14 @@ void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 }

 const struct packed_git *has_packed_and_bad(struct repository *r,
-					    const unsigned char *sha1)
+					    const struct object_id *oid)
 {
 	struct packed_git *p;
 	unsigned i;

 	for (p = r->objects->packed_git; p; p = p->next)
 		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(sha1,
+			if (hasheq(oid->hash,
 				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
 				return p;
 	return NULL;
diff --git a/packfile.h b/packfile.h
index a982ed9994..186146779d 100644
--- a/packfile.h
+++ b/packfile.h
@@ -160,7 +160,7 @@ int packed_object_info(struct repository *r,
 		       off_t offset, struct object_info *);

 void mark_bad_packed_object(struct packed_git *, const struct object_id *);
-const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1);
+const struct packed_git *has_packed_and_bad(struct repository *, const struct object_id *);

 #define ON_DISK_KEEP_PACKS 1
 #define IN_CORE_KEEP_PACKS 2
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/3] packfile: use oidset for bad objects
  2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
  2021-09-11  7:59 ` [PATCH 1/3] packfile: convert mark_bad_packed_object() to object_id René Scharfe
  2021-09-11  8:00 ` [PATCH 2/3] packfile: convert has_packed_and_bad() " René Scharfe
@ 2021-09-11  8:01 ` René Scharfe
  2021-09-11 14:26   ` Jeff King
  2021-09-11 14:27 ` [PATCH 0/3] " Jeff King
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 28+ messages in thread
From: René Scharfe @ 2021-09-11  8:01 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano

Store the object ID of broken pack entries in an oidset instead of
keeping only their hashes in an unsorted array.  The resulting code is
shorter and easier to read.  It also handles the (hopefully) very rare
case of having a high number of bad objects better.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 midx.c         | 13 ++++---------
 object-store.h |  4 ++--
 packfile.c     | 27 +++++----------------------
 3 files changed, 11 insertions(+), 33 deletions(-)

diff --git a/midx.c b/midx.c
index 321c6fdd2f..01623fb339 100644
--- a/midx.c
+++ b/midx.c
@@ -283,6 +283,7 @@ static int nth_midxed_pack_entry(struct repository *r,
 {
 	uint32_t pack_int_id;
 	struct packed_git *p;
+	struct object_id oid;

 	if (pos >= m->num_objects)
 		return 0;
@@ -303,15 +304,9 @@ static int nth_midxed_pack_entry(struct repository *r,
 	if (!is_pack_valid(p))
 		return 0;

-	if (p->num_bad_objects) {
-		uint32_t i;
-		struct object_id oid;
-		nth_midxed_object_oid(&oid, m, pos);
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid.hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return 0;
-	}
+	nth_midxed_object_oid(&oid, m, pos);
+	if (oidset_contains(&p->bad_objects, &oid))
+		return 0;

 	e->offset = nth_midxed_offset(m, pos);
 	e->p = p;
diff --git a/object-store.h b/object-store.h
index b4dc6668aa..c7bead66f6 100644
--- a/object-store.h
+++ b/object-store.h
@@ -10,6 +10,7 @@
 #include "khash.h"
 #include "dir.h"
 #include "oidtree.h"
+#include "oidset.h"

 struct object_directory {
 	struct object_directory *next;
@@ -75,9 +76,8 @@ struct packed_git {
 	const void *index_data;
 	size_t index_size;
 	uint32_t num_objects;
-	uint32_t num_bad_objects;
 	uint32_t crc_offset;
-	unsigned char *bad_object_sha1;
+	struct oidset bad_objects;
 	int index_version;
 	time_t mtime;
 	int pack_fd;
diff --git a/packfile.c b/packfile.c
index 04080a558b..8f6d1d6328 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1163,29 +1163,17 @@ int unpack_object_header(struct packed_git *p,

 void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 {
-	unsigned i;
-	const unsigned hashsz = the_hash_algo->rawsz;
-	for (i = 0; i < p->num_bad_objects; i++)
-		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
-			return;
-	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
-				      st_mult(GIT_MAX_RAWSZ,
-					      st_add(p->num_bad_objects, 1)));
-	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
-	p->num_bad_objects++;
+	oidset_insert(&p->bad_objects, oid);
 }

 const struct packed_git *has_packed_and_bad(struct repository *r,
 					    const struct object_id *oid)
 {
 	struct packed_git *p;
-	unsigned i;

 	for (p = r->objects->packed_git; p; p = p->next)
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return p;
+		if (oidset_contains(&p->bad_objects, oid))
+			return p;
 	return NULL;
 }

@@ -2016,13 +2004,8 @@ static int fill_pack_entry(const struct object_id *oid,
 {
 	off_t offset;

-	if (p->num_bad_objects) {
-		unsigned i;
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return 0;
-	}
+	if (oidset_contains(&p->bad_objects, oid))
+		return 0;

 	offset = find_pack_entry_one(oid->hash, p);
 	if (!offset)
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] packfile: use oidset for bad objects
  2021-09-11  8:01 ` [PATCH 3/3] packfile: use oidset for bad objects René Scharfe
@ 2021-09-11 14:26   ` Jeff King
  2021-09-11 16:08     ` René Scharfe
  0 siblings, 1 reply; 28+ messages in thread
From: Jeff King @ 2021-09-11 14:26 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano

On Sat, Sep 11, 2021 at 10:01:40AM +0200, René Scharfe wrote:

> Store the object ID of broken pack entries in an oidset instead of
> keeping only their hashes in an unsorted array.  The resulting code is
> shorter and easier to read.  It also handles the (hopefully) very rare
> case of having a high number of bad objects better.

Yay, I'm very happy to see this kind of cleanup replacing ad hoc data
structures with well-tested ones.

> @@ -303,15 +304,9 @@ static int nth_midxed_pack_entry(struct repository *r,
>  	if (!is_pack_valid(p))
>  		return 0;
> 
> -	if (p->num_bad_objects) {
> -		uint32_t i;
> -		struct object_id oid;
> -		nth_midxed_object_oid(&oid, m, pos);
> -		for (i = 0; i < p->num_bad_objects; i++)
> -			if (hasheq(oid.hash,
> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
> -				return 0;
> -	}
> +	nth_midxed_object_oid(&oid, m, pos);
> +	if (oidset_contains(&p->bad_objects, &oid))
> +		return 0;

Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
the old code, we'd skip that in the common case that we had no corrupt
objects, but now we'll pay the cost regardless. memcpy() isn't _that_
expensive, but I'd expect this to be a relatively hot code path.

Is it worth sticking all of this inside:

  if (oidset_size(&p->bad_objects))

?

> diff --git a/packfile.c b/packfile.c
> index 04080a558b..8f6d1d6328 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1163,29 +1163,17 @@ int unpack_object_header(struct packed_git *p,
> 
>  void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
>  {
> -	unsigned i;
> -	const unsigned hashsz = the_hash_algo->rawsz;
> -	for (i = 0; i < p->num_bad_objects; i++)
> -		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
> -			return;

I cringed at the hasheq() and hashcpy() calls in the earlier patches.
Happy to see them go away now. :)

> -	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
> -				      st_mult(GIT_MAX_RAWSZ,
> -					      st_add(p->num_bad_objects, 1)));
> -	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
> -	p->num_bad_objects++;
> +	oidset_insert(&p->bad_objects, oid);
>  }

So now marking a bad object  is a one-liner. We _could_ just inline it
at the callers, but I like keeping the implementation abstract.

>  const struct packed_git *has_packed_and_bad(struct repository *r,
>  					    const struct object_id *oid)
>  {
>  	struct packed_git *p;
> -	unsigned i;
> 
>  	for (p = r->objects->packed_git; p; p = p->next)
> -		for (i = 0; i < p->num_bad_objects; i++)
> -			if (hasheq(oid->hash,
> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
> -				return p;
> +		if (oidset_contains(&p->bad_objects, oid))
> +			return p;
>  	return NULL;
>  }

Not related to your patch, but I noticed how terribly inefficient this
function could be in a repo with a lot of packs. But we only call it
once in the error case right before we die(), so a linear scan is no
problem.

> @@ -2016,13 +2004,8 @@ static int fill_pack_entry(const struct object_id *oid,
>  {
>  	off_t offset;
> 
> -	if (p->num_bad_objects) {
> -		unsigned i;
> -		for (i = 0; i < p->num_bad_objects; i++)
> -			if (hasheq(oid->hash,
> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
> -				return 0;
> -	}
> +	if (oidset_contains(&p->bad_objects, oid))
> +		return 0;

And this one (and the previous) have the oid already, so they don't have
to worry about optimizing the is-it-empty check first.

-Peff

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/3] packfile: use oidset for bad objects
  2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
                   ` (2 preceding siblings ...)
  2021-09-11  8:01 ` [PATCH 3/3] packfile: use oidset for bad objects René Scharfe
@ 2021-09-11 14:27 ` Jeff King
  2021-09-11 16:08 ` [PATCH 4/3] midx: inline nth_midxed_pack_entry() René Scharfe
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
  5 siblings, 0 replies; 28+ messages in thread
From: Jeff King @ 2021-09-11 14:27 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano

On Sat, Sep 11, 2021 at 09:50:36AM +0200, René Scharfe wrote:

> Replace the custom hash array for remembering corrupt pack entries with
> an oidset.  This shortens and simplifies the code.

Thanks, these were a pleasure to read.

I noticed one possible small performance change in the third patch. It
should be easy to remedy if we want (but I could also believe that it
doesn't matter either way, if you care to argue that :) ).

-Peff

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] packfile: use oidset for bad objects
  2021-09-11 14:26   ` Jeff King
@ 2021-09-11 16:08     ` René Scharfe
  2021-09-11 17:03       ` Jeff King
  0 siblings, 1 reply; 28+ messages in thread
From: René Scharfe @ 2021-09-11 16:08 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List, Junio C Hamano

Am 11.09.21 um 16:26 schrieb Jeff King:
> On Sat, Sep 11, 2021 at 10:01:40AM +0200, René Scharfe wrote:
>
>> Store the object ID of broken pack entries in an oidset instead of
>> keeping only their hashes in an unsorted array.  The resulting code is
>> shorter and easier to read.  It also handles the (hopefully) very rare
>> case of having a high number of bad objects better.
>
> Yay, I'm very happy to see this kind of cleanup replacing ad hoc data
> structures with well-tested ones.
>
>> @@ -303,15 +304,9 @@ static int nth_midxed_pack_entry(struct repository *r,
>>  	if (!is_pack_valid(p))
>>  		return 0;
>>
>> -	if (p->num_bad_objects) {
>> -		uint32_t i;
>> -		struct object_id oid;
>> -		nth_midxed_object_oid(&oid, m, pos);
>> -		for (i = 0; i < p->num_bad_objects; i++)
>> -			if (hasheq(oid.hash,
>> -				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
>> -				return 0;
>> -	}
>> +	nth_midxed_object_oid(&oid, m, pos);
>> +	if (oidset_contains(&p->bad_objects, &oid))
>> +		return 0;
>
> Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
> the old code, we'd skip that in the common case that we had no corrupt
> objects, but now we'll pay the cost regardless. memcpy() isn't _that_
> expensive, but I'd expect this to be a relatively hot code path.
>
> Is it worth sticking all of this inside:
>
>   if (oidset_size(&p->bad_objects))
>
> ?

Hard to say.  It would certainly match the old code more closely.  Is a
function call cheaper than copying 32 bytes?  Depends on the CPU and
whether the hash is cached, I guess.  And cached it probably is, because
the caller did a binary search for it..

We can pass on the original oid to avoid the nth_midxed_object_oid()
call, but inlining the whole thing might even be nicer.

René

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 4/3] midx: inline nth_midxed_pack_entry()
  2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
                   ` (3 preceding siblings ...)
  2021-09-11 14:27 ` [PATCH 0/3] " Jeff King
@ 2021-09-11 16:08 ` René Scharfe
  2021-09-11 17:03   ` René Scharfe
  2021-09-11 17:07   ` Jeff King
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
  5 siblings, 2 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 16:08 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

fill_midx_entry() finds the position of an object ID and passes it to
nth_midxed_pack_entry(), which uses the position to look up the object
ID for its own purposes.  Inline the latter into the former to avoid
that lookup.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 midx.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/midx.c b/midx.c
index 01623fb339..59517938a8 100644
--- a/midx.c
+++ b/midx.c
@@ -276,14 +276,17 @@ uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos)
 			(off_t)pos * MIDX_CHUNK_OFFSET_WIDTH);
 }

-static int nth_midxed_pack_entry(struct repository *r,
-				 struct multi_pack_index *m,
-				 struct pack_entry *e,
-				 uint32_t pos)
+int fill_midx_entry(struct repository * r,
+		    const struct object_id *oid,
+		    struct pack_entry *e,
+		    struct multi_pack_index *m)
 {
+	uint32_t pos;
 	uint32_t pack_int_id;
 	struct packed_git *p;
-	struct object_id oid;
+
+	if (!bsearch_midx(oid, m, &pos))
+		return 0;

 	if (pos >= m->num_objects)
 		return 0;
@@ -304,8 +307,7 @@ static int nth_midxed_pack_entry(struct repository *r,
 	if (!is_pack_valid(p))
 		return 0;

-	nth_midxed_object_oid(&oid, m, pos);
-	if (oidset_contains(&p->bad_objects, &oid))
+	if (oidset_contains(&p->bad_objects, oid))
 		return 0;

 	e->offset = nth_midxed_offset(m, pos);
@@ -314,19 +316,6 @@ static int nth_midxed_pack_entry(struct repository *r,
 	return 1;
 }

-int fill_midx_entry(struct repository * r,
-		    const struct object_id *oid,
-		    struct pack_entry *e,
-		    struct multi_pack_index *m)
-{
-	uint32_t pos;
-
-	if (!bsearch_midx(oid, m, &pos))
-		return 0;
-
-	return nth_midxed_pack_entry(r, m, e, pos);
-}
-
 /* Match "foo.idx" against either "foo.pack" _or_ "foo.idx". */
 static int cmp_idx_or_pack_name(const char *idx_or_pack_name,
 				const char *idx_name)
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] packfile: use oidset for bad objects
  2021-09-11 16:08     ` René Scharfe
@ 2021-09-11 17:03       ` Jeff King
  2021-09-11 17:16         ` René Scharfe
  0 siblings, 1 reply; 28+ messages in thread
From: Jeff King @ 2021-09-11 17:03 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano

On Sat, Sep 11, 2021 at 06:08:38PM +0200, René Scharfe wrote:

> >> +	nth_midxed_object_oid(&oid, m, pos);
> >> +	if (oidset_contains(&p->bad_objects, &oid))
> >> +		return 0;
> >
> > Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
> > the old code, we'd skip that in the common case that we had no corrupt
> > objects, but now we'll pay the cost regardless. memcpy() isn't _that_
> > expensive, but I'd expect this to be a relatively hot code path.
> >
> > Is it worth sticking all of this inside:
> >
> >   if (oidset_size(&p->bad_objects))
> >
> > ?
> 
> Hard to say.  It would certainly match the old code more closely.  Is a
> function call cheaper than copying 32 bytes?  Depends on the CPU and
> whether the hash is cached, I guess.  And cached it probably is, because
> the caller did a binary search for it..

You already have a function call for nth_midxed_object_oid(), so
checking oidset_size() would be a strict improvement.

> We can pass on the original oid to avoid the nth_midxed_object_oid()
> call, but inlining the whole thing might even be nicer.

Yeah, it occurs to me that oidset_size() would be a good candidate for
inlining, if that's what you mean.

-Peff

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/3] midx: inline nth_midxed_pack_entry()
  2021-09-11 16:08 ` [PATCH 4/3] midx: inline nth_midxed_pack_entry() René Scharfe
@ 2021-09-11 17:03   ` René Scharfe
  2021-09-11 17:07   ` Jeff King
  1 sibling, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 17:03 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

Am 11.09.21 um 18:08 schrieb René Scharfe:
> fill_midx_entry() finds the position of an object ID and passes it to
> nth_midxed_pack_entry(), which uses the position to look up the object
> ID for its own purposes.  Inline the latter into the former to avoid
> that lookup.
>

Forgot:

Reported-by: Jeff King <peff@peff.net>

> Signed-off-by: René Scharfe <l.s.r@web.de>

René

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/3] midx: inline nth_midxed_pack_entry()
  2021-09-11 16:08 ` [PATCH 4/3] midx: inline nth_midxed_pack_entry() René Scharfe
  2021-09-11 17:03   ` René Scharfe
@ 2021-09-11 17:07   ` Jeff King
  2021-09-11 20:31     ` René Scharfe
  1 sibling, 1 reply; 28+ messages in thread
From: Jeff King @ 2021-09-11 17:07 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano

On Sat, Sep 11, 2021 at 06:08:42PM +0200, René Scharfe wrote:

> fill_midx_entry() finds the position of an object ID and passes it to
> nth_midxed_pack_entry(), which uses the position to look up the object
> ID for its own purposes.  Inline the latter into the former to avoid
> that lookup.

Ah, I see what you mean now by "inline" in the other part of the thread.

Yes, I think this makes sense since there is no other reasonable caller
of the nth_midxed_pack_entry() helper (and its one caller is itself
trivial).

> @@ -304,8 +307,7 @@ static int nth_midxed_pack_entry(struct repository *r,
>  	if (!is_pack_valid(p))
>  		return 0;
> 
> -	nth_midxed_object_oid(&oid, m, pos);
> -	if (oidset_contains(&p->bad_objects, &oid))
> +	if (oidset_contains(&p->bad_objects, oid))
>  		return 0;

So we get to avoid the nth_midxed_object_oid() copy entirely. Very nice.

Compared to the code before your series, we still have an extra function
call to oidset_contains(), which will (in the common case) notice we
have no entries and immediately return. But I think that's getting into
pointless micro-optimization.

-Peff

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] packfile: use oidset for bad objects
  2021-09-11 17:03       ` Jeff King
@ 2021-09-11 17:16         ` René Scharfe
  0 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 17:16 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List, Junio C Hamano

Am 11.09.21 um 19:03 schrieb Jeff King:
> On Sat, Sep 11, 2021 at 06:08:38PM +0200, René Scharfe wrote:
>
>>>> +	nth_midxed_object_oid(&oid, m, pos);
>>>> +	if (oidset_contains(&p->bad_objects, &oid))
>>>> +		return 0;
>>>
>>> Calling nth_midxed_object_oid() implies a memcpy() under the hood. In
>>> the old code, we'd skip that in the common case that we had no corrupt
>>> objects, but now we'll pay the cost regardless. memcpy() isn't _that_
>>> expensive, but I'd expect this to be a relatively hot code path.
>>>
>>> Is it worth sticking all of this inside:
>>>
>>>   if (oidset_size(&p->bad_objects))
>>>
>>> ?
>>
>> Hard to say.  It would certainly match the old code more closely.  Is a
>> function call cheaper than copying 32 bytes?  Depends on the CPU and
>> whether the hash is cached, I guess.  And cached it probably is, because
>> the caller did a binary search for it..
>
> You already have a function call for nth_midxed_object_oid(), so
> checking oidset_size() would be a strict improvement.

If I read the assembly correctly nth_midxed_object_oid() is inlined by the
compiler in my build, as is nth_midxed_pack_entry().  Both are defined in
the same file, so other compilers may easily do the same.

>> We can pass on the original oid to avoid the nth_midxed_object_oid()
>> call, but inlining the whole thing might even be nicer.
>
> Yeah, it occurs to me that oidset_size() would be a good candidate for
> inlining, if that's what you mean.

True, but I meant something else (see patch 4/3). :)

René

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/3] midx: inline nth_midxed_pack_entry()
  2021-09-11 17:07   ` Jeff King
@ 2021-09-11 20:31     ` René Scharfe
  2021-09-11 21:20       ` Jeff King
  0 siblings, 1 reply; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:31 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List, Junio C Hamano

Am 11.09.21 um 19:07 schrieb Jeff King:
> On Sat, Sep 11, 2021 at 06:08:42PM +0200, René Scharfe wrote:
>
>> @@ -304,8 +307,7 @@ static int nth_midxed_pack_entry(struct repository *r,
>>  	if (!is_pack_valid(p))
>>  		return 0;
>>
>> -	nth_midxed_object_oid(&oid, m, pos);
>> -	if (oidset_contains(&p->bad_objects, &oid))
>> +	if (oidset_contains(&p->bad_objects, oid))
>>  		return 0;
>
> So we get to avoid the nth_midxed_object_oid() copy entirely. Very nice.
>
> Compared to the code before your series, we still have an extra function
> call to oidset_contains(), which will (in the common case) notice we
> have no entries and immediately return. But I think that's getting into
> pointless micro-optimization.

Right.  I measure a 0.5% slowdown for git multi-pack-index verify.  An
inline oidset_size call avoids it.  That's easy enough to add, so let's
have it!

René

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
                   ` (4 preceding siblings ...)
  2021-09-11 16:08 ` [PATCH 4/3] midx: inline nth_midxed_pack_entry() René Scharfe
@ 2021-09-11 20:31 ` René Scharfe
  2021-09-11 20:36   ` [PATCH 1/5] oidset: make oidset_size() an inline function René Scharfe
                     ` (7 more replies)
  5 siblings, 8 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:31 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

Replace the custom hash array for remembering corrupt pack entries with
an oidset.  This shortens and simplifies the code.

Changes since v1:
- inline oidset_size()
- inline nth_midxed_pack_entry() early
- use oidset_size() to avoid a function call if no bad objects exist

  oidset: make oidset_size() an inline function
  midx: inline nth_midxed_pack_entry()
  packfile: convert mark_bad_packed_object() to object_id
  packfile: convert has_packed_and_bad() to object_id
  packfile: use oidset for bad objects

 midx.c         | 37 +++++++++++--------------------------
 object-file.c  |  4 ++--
 object-store.h |  4 ++--
 oidset.c       |  5 -----
 oidset.h       |  5 ++++-
 packfile.c     | 38 +++++++++++---------------------------
 packfile.h     |  4 ++--
 7 files changed, 32 insertions(+), 65 deletions(-)

--
2.33.0

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/5] oidset: make oidset_size() an inline function
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
@ 2021-09-11 20:36   ` René Scharfe
  2021-09-11 20:39   ` [PATCH 2/5] midx: inline nth_midxed_pack_entry() René Scharfe
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:36 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

oidset_size() just reads a single word from memory and returns it.
Avoid the function call overhead for this trivial operation by turning
it into an inline function.

While we're at it, declare its parameter const to allow it to be used
on read-only oidsets.

Suggested-by: Jeff King <peff@peff.net>
Signed-off-by: René Scharfe <l.s.r@web.de>
---
 oidset.c | 5 -----
 oidset.h | 5 ++++-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/oidset.c b/oidset.c
index 5aac633c1f..b36a2bae86 100644
--- a/oidset.c
+++ b/oidset.c
@@ -36,11 +36,6 @@ void oidset_clear(struct oidset *set)
 	oidset_init(set, 0);
 }

-int oidset_size(struct oidset *set)
-{
-	return kh_size(&set->set);
-}
-
 void oidset_parse_file(struct oidset *set, const char *path)
 {
 	oidset_parse_file_carefully(set, path, NULL, NULL);
diff --git a/oidset.h b/oidset.h
index 01f6560283..ba4a5a2cd3 100644
--- a/oidset.h
+++ b/oidset.h
@@ -57,7 +57,10 @@ int oidset_remove(struct oidset *set, const struct object_id *oid);
 /**
  * Returns the number of oids in the set.
  */
-int oidset_size(struct oidset *set);
+static inline int oidset_size(const struct oidset *set)
+{
+	return kh_size(&set->set);
+}

 /**
  * Remove all entries from the oidset, freeing any resources associated with
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/5] midx: inline nth_midxed_pack_entry()
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
  2021-09-11 20:36   ` [PATCH 1/5] oidset: make oidset_size() an inline function René Scharfe
@ 2021-09-11 20:39   ` René Scharfe
  2021-09-11 20:40   ` [PATCH 3/5] packfile: convert mark_bad_packed_object() to object_id René Scharfe
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:39 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

fill_midx_entry() finds the position of an object ID and passes it to
nth_midxed_pack_entry(), which uses the position to look up the object
ID for its own purposes.  Inline the latter into the former to avoid
that lookup.


Signed-off-by: René Scharfe <l.s.r@web.de>
---
 midx.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/midx.c b/midx.c
index 321c6fdd2f..8cb063023c 100644
--- a/midx.c
+++ b/midx.c
@@ -276,14 +276,18 @@ uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos)
 			(off_t)pos * MIDX_CHUNK_OFFSET_WIDTH);
 }

-static int nth_midxed_pack_entry(struct repository *r,
-				 struct multi_pack_index *m,
-				 struct pack_entry *e,
-				 uint32_t pos)
+int fill_midx_entry(struct repository * r,
+		    const struct object_id *oid,
+		    struct pack_entry *e,
+		    struct multi_pack_index *m)
 {
+	uint32_t pos;
 	uint32_t pack_int_id;
 	struct packed_git *p;

+	if (!bsearch_midx(oid, m, &pos))
+		return 0;
+
 	if (pos >= m->num_objects)
 		return 0;

@@ -305,10 +309,8 @@ static int nth_midxed_pack_entry(struct repository *r,

 	if (p->num_bad_objects) {
 		uint32_t i;
-		struct object_id oid;
-		nth_midxed_object_oid(&oid, m, pos);
 		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid.hash,
+			if (hasheq(oid->hash,
 				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
 				return 0;
 	}
@@ -319,19 +321,6 @@ static int nth_midxed_pack_entry(struct repository *r,
 	return 1;
 }

-int fill_midx_entry(struct repository * r,
-		    const struct object_id *oid,
-		    struct pack_entry *e,
-		    struct multi_pack_index *m)
-{
-	uint32_t pos;
-
-	if (!bsearch_midx(oid, m, &pos))
-		return 0;
-
-	return nth_midxed_pack_entry(r, m, e, pos);
-}
-
 /* Match "foo.idx" against either "foo.pack" _or_ "foo.idx". */
 static int cmp_idx_or_pack_name(const char *idx_or_pack_name,
 				const char *idx_name)
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/5] packfile: convert mark_bad_packed_object() to object_id
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
  2021-09-11 20:36   ` [PATCH 1/5] oidset: make oidset_size() an inline function René Scharfe
  2021-09-11 20:39   ` [PATCH 2/5] midx: inline nth_midxed_pack_entry() René Scharfe
@ 2021-09-11 20:40   ` René Scharfe
  2021-09-11 20:42   ` [PATCH v2 4/5] packfile: convert has_packed_and_bad() " René Scharfe
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:40 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

All callers have full object IDs, so pass them on instead of just their
hash member.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 object-file.c |  2 +-
 packfile.c    | 12 ++++++------
 packfile.h    |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/object-file.c b/object-file.c
index a8be899481..fb5a385a06 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1616,7 +1616,7 @@ static int do_oid_object_info_extended(struct repository *r,
 		return 0;
 	rtype = packed_object_info(r, e.p, e.offset, oi);
 	if (rtype < 0) {
-		mark_bad_packed_object(e.p, real->hash);
+		mark_bad_packed_object(e.p, real);
 		return do_oid_object_info_extended(r, real, oi, 0);
 	} else if (oi->whence == OI_PACKED) {
 		oi->u.packed.offset = e.offset;
diff --git a/packfile.c b/packfile.c
index 4d0d625238..fb15fc5b49 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1161,17 +1161,17 @@ int unpack_object_header(struct packed_git *p,
 	return type;
 }

-void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1)
+void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 {
 	unsigned i;
 	const unsigned hashsz = the_hash_algo->rawsz;
 	for (i = 0; i < p->num_bad_objects; i++)
-		if (hasheq(sha1, p->bad_object_sha1 + hashsz * i))
+		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
 			return;
 	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
 				      st_mult(GIT_MAX_RAWSZ,
 					      st_add(p->num_bad_objects, 1)));
-	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, sha1);
+	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
 	p->num_bad_objects++;
 }

@@ -1272,7 +1272,7 @@ static int retry_bad_packed_offset(struct repository *r,
 	if (offset_to_pack_pos(p, obj_offset, &pos) < 0)
 		return OBJ_BAD;
 	nth_packed_object_id(&oid, p, pack_pos_to_index(p, pos));
-	mark_bad_packed_object(p, oid.hash);
+	mark_bad_packed_object(p, &oid);
 	type = oid_object_info(r, &oid, NULL);
 	if (type <= OBJ_NONE)
 		return OBJ_BAD;
@@ -1722,7 +1722,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
 				nth_packed_object_id(&oid, p, index_pos);
 				error("bad packed object CRC for %s",
 				      oid_to_hex(&oid));
-				mark_bad_packed_object(p, oid.hash);
+				mark_bad_packed_object(p, &oid);
 				data = NULL;
 				goto out;
 			}
@@ -1811,7 +1811,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
 				      " at offset %"PRIuMAX" from %s",
 				      oid_to_hex(&base_oid), (uintmax_t)obj_offset,
 				      p->pack_name);
-				mark_bad_packed_object(p, base_oid.hash);
+				mark_bad_packed_object(p, &base_oid);
 				base = read_object(r, &base_oid, &type, &base_size);
 				external_base = base;
 			}
diff --git a/packfile.h b/packfile.h
index 3ae117a8ae..a982ed9994 100644
--- a/packfile.h
+++ b/packfile.h
@@ -159,7 +159,7 @@ int packed_object_info(struct repository *r,
 		       struct packed_git *pack,
 		       off_t offset, struct object_info *);

-void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1);
+void mark_bad_packed_object(struct packed_git *, const struct object_id *);
 const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1);

 #define ON_DISK_KEEP_PACKS 1
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v2 4/5] packfile: convert has_packed_and_bad() to object_id
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
                     ` (2 preceding siblings ...)
  2021-09-11 20:40   ` [PATCH 3/5] packfile: convert mark_bad_packed_object() to object_id René Scharfe
@ 2021-09-11 20:42   ` René Scharfe
  2021-09-11 20:43   ` [PATCH v2 5/5] packfile: use oidset for bad objects René Scharfe
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:42 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

The single caller has a full object ID, so pass it on instead of just
its hash member.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 object-file.c | 2 +-
 packfile.c    | 4 ++--
 packfile.h    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index fb5a385a06..01e7058b4e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1725,7 +1725,7 @@ void *read_object_file_extended(struct repository *r,
 		die(_("loose object %s (stored in %s) is corrupt"),
 		    oid_to_hex(repl), path);

-	if ((p = has_packed_and_bad(r, repl->hash)) != NULL)
+	if ((p = has_packed_and_bad(r, repl)) != NULL)
 		die(_("packed object %s (stored in %s) is corrupt"),
 		    oid_to_hex(repl), p->pack_name);
 	obj_read_unlock();
diff --git a/packfile.c b/packfile.c
index fb15fc5b49..04080a558b 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1176,14 +1176,14 @@ void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 }

 const struct packed_git *has_packed_and_bad(struct repository *r,
-					    const unsigned char *sha1)
+					    const struct object_id *oid)
 {
 	struct packed_git *p;
 	unsigned i;

 	for (p = r->objects->packed_git; p; p = p->next)
 		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(sha1,
+			if (hasheq(oid->hash,
 				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
 				return p;
 	return NULL;
diff --git a/packfile.h b/packfile.h
index a982ed9994..186146779d 100644
--- a/packfile.h
+++ b/packfile.h
@@ -160,7 +160,7 @@ int packed_object_info(struct repository *r,
 		       off_t offset, struct object_info *);

 void mark_bad_packed_object(struct packed_git *, const struct object_id *);
-const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1);
+const struct packed_git *has_packed_and_bad(struct repository *, const struct object_id *);

 #define ON_DISK_KEEP_PACKS 1
 #define IN_CORE_KEEP_PACKS 2
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v2 5/5] packfile: use oidset for bad objects
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
                     ` (3 preceding siblings ...)
  2021-09-11 20:42   ` [PATCH v2 4/5] packfile: convert has_packed_and_bad() " René Scharfe
@ 2021-09-11 20:43   ` René Scharfe
  2021-09-11 20:45   ` [PATCH v2 0/5] " René Scharfe
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:43 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

Store the object ID of broken pack entries in an oidset instead of
keeping only their hashes in an unsorted array.  The resulting code is
shorter and easier to read.  It also handles the (hopefully) very rare
case of having a high number of bad objects better.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: René Scharfe <l.s.r@web.de>
---
 midx.c         | 10 +++-------
 object-store.h |  4 ++--
 packfile.c     | 28 ++++++----------------------
 3 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/midx.c b/midx.c
index 8cb063023c..76322b713c 100644
--- a/midx.c
+++ b/midx.c
@@ -307,13 +307,9 @@ int fill_midx_entry(struct repository * r,
 	if (!is_pack_valid(p))
 		return 0;

-	if (p->num_bad_objects) {
-		uint32_t i;
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return 0;
-	}
+	if (oidset_size(&p->bad_objects) &&
+	    oidset_contains(&p->bad_objects, oid))
+		return 0;

 	e->offset = nth_midxed_offset(m, pos);
 	e->p = p;
diff --git a/object-store.h b/object-store.h
index b4dc6668aa..c7bead66f6 100644
--- a/object-store.h
+++ b/object-store.h
@@ -10,6 +10,7 @@
 #include "khash.h"
 #include "dir.h"
 #include "oidtree.h"
+#include "oidset.h"

 struct object_directory {
 	struct object_directory *next;
@@ -75,9 +76,8 @@ struct packed_git {
 	const void *index_data;
 	size_t index_size;
 	uint32_t num_objects;
-	uint32_t num_bad_objects;
 	uint32_t crc_offset;
-	unsigned char *bad_object_sha1;
+	struct oidset bad_objects;
 	int index_version;
 	time_t mtime;
 	int pack_fd;
diff --git a/packfile.c b/packfile.c
index 04080a558b..caba29c624 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1163,29 +1163,17 @@ int unpack_object_header(struct packed_git *p,

 void mark_bad_packed_object(struct packed_git *p, const struct object_id *oid)
 {
-	unsigned i;
-	const unsigned hashsz = the_hash_algo->rawsz;
-	for (i = 0; i < p->num_bad_objects; i++)
-		if (hasheq(oid->hash, p->bad_object_sha1 + hashsz * i))
-			return;
-	p->bad_object_sha1 = xrealloc(p->bad_object_sha1,
-				      st_mult(GIT_MAX_RAWSZ,
-					      st_add(p->num_bad_objects, 1)));
-	hashcpy(p->bad_object_sha1 + hashsz * p->num_bad_objects, oid->hash);
-	p->num_bad_objects++;
+	oidset_insert(&p->bad_objects, oid);
 }

 const struct packed_git *has_packed_and_bad(struct repository *r,
 					    const struct object_id *oid)
 {
 	struct packed_git *p;
-	unsigned i;

 	for (p = r->objects->packed_git; p; p = p->next)
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return p;
+		if (oidset_contains(&p->bad_objects, oid))
+			return p;
 	return NULL;
 }

@@ -2016,13 +2004,9 @@ static int fill_pack_entry(const struct object_id *oid,
 {
 	off_t offset;

-	if (p->num_bad_objects) {
-		unsigned i;
-		for (i = 0; i < p->num_bad_objects; i++)
-			if (hasheq(oid->hash,
-				   p->bad_object_sha1 + the_hash_algo->rawsz * i))
-				return 0;
-	}
+	if (oidset_size(&p->bad_objects) &&
+	    oidset_contains(&p->bad_objects, oid))
+		return 0;

 	offset = find_pack_entry_one(oid->hash, p);
 	if (!offset)
--
2.33.0

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
                     ` (4 preceding siblings ...)
  2021-09-11 20:43   ` [PATCH v2 5/5] packfile: use oidset for bad objects René Scharfe
@ 2021-09-11 20:45   ` René Scharfe
  2021-09-11 21:22   ` Jeff King
  2021-09-11 23:59   ` Derrick Stolee
  7 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 20:45 UTC (permalink / raw)
  To: Git List; +Cc: Junio C Hamano, Jeff King

Am 11.09.21 um 22:31 schrieb René Scharfe:
> Replace the custom hash array for remembering corrupt pack entries with
> an oidset.  This shortens and simplifies the code.
>
> Changes since v1:
> - inline oidset_size()
> - inline nth_midxed_pack_entry() early
> - use oidset_size() to avoid a function call if no bad objects exist

- forgot to add "v2" to the subject of patches 1, 2, 3 :-/

René

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/3] midx: inline nth_midxed_pack_entry()
  2021-09-11 20:31     ` René Scharfe
@ 2021-09-11 21:20       ` Jeff King
  2021-09-11 23:39         ` René Scharfe
  0 siblings, 1 reply; 28+ messages in thread
From: Jeff King @ 2021-09-11 21:20 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano

On Sat, Sep 11, 2021 at 10:31:34PM +0200, René Scharfe wrote:

> Am 11.09.21 um 19:07 schrieb Jeff King:
> > On Sat, Sep 11, 2021 at 06:08:42PM +0200, René Scharfe wrote:
> >
> >> @@ -304,8 +307,7 @@ static int nth_midxed_pack_entry(struct repository *r,
> >>  	if (!is_pack_valid(p))
> >>  		return 0;
> >>
> >> -	nth_midxed_object_oid(&oid, m, pos);
> >> -	if (oidset_contains(&p->bad_objects, &oid))
> >> +	if (oidset_contains(&p->bad_objects, oid))
> >>  		return 0;
> >
> > So we get to avoid the nth_midxed_object_oid() copy entirely. Very nice.
> >
> > Compared to the code before your series, we still have an extra function
> > call to oidset_contains(), which will (in the common case) notice we
> > have no entries and immediately return. But I think that's getting into
> > pointless micro-optimization.
> 
> Right.  I measure a 0.5% slowdown for git multi-pack-index verify.  An
> inline oidset_size call avoids it.  That's easy enough to add, so let's
> have it!

I don't mind that, but I wonder if we can have our cake and eat it, too.

oidset_contains() is short, too, and could be inlined. Or if we're
worried about the size of the embedded kh_get_oid_set() getting inlined,
we could do something like:

  static inline int oidset_contains(const struct oidset *set, const
				    struct object_id *oid)
  {
	if (!oidset_size(set))
		return 0;
	return oidset_contains_func(set, oid);
  }

That saves callers from having to deal with it, at the expense of a
slightly complicated oidset implementation.

I guess it's an extra integer comparison for callers that _do_ expect to
have a non-empty set. So maybe it is better left to the caller to
decide whether to optimize in this way.

(A totally inline oidset_contains() avoids the extra check, but possibly
at the cost of larger code size).

-Peff

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
                     ` (5 preceding siblings ...)
  2021-09-11 20:45   ` [PATCH v2 0/5] " René Scharfe
@ 2021-09-11 21:22   ` Jeff King
  2021-09-11 23:59   ` Derrick Stolee
  7 siblings, 0 replies; 28+ messages in thread
From: Jeff King @ 2021-09-11 21:22 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano

On Sat, Sep 11, 2021 at 10:31:52PM +0200, René Scharfe wrote:

> Replace the custom hash array for remembering corrupt pack entries with
> an oidset.  This shortens and simplifies the code.
> 
> Changes since v1:
> - inline oidset_size()
> - inline nth_midxed_pack_entry() early
> - use oidset_size() to avoid a function call if no bad objects exist

Thanks, these all look fine to me. I raised a question elsewhere in the
thread about inlining oidset_contains(), but I think that can be
considered separately (and is less clear-cut; we might win by saving a
function call, but we might lose due to larger code size).

-Peff

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/3] midx: inline nth_midxed_pack_entry()
  2021-09-11 21:20       ` Jeff King
@ 2021-09-11 23:39         ` René Scharfe
  0 siblings, 0 replies; 28+ messages in thread
From: René Scharfe @ 2021-09-11 23:39 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List, Junio C Hamano

Am 11.09.21 um 23:20 schrieb Jeff King:
> On Sat, Sep 11, 2021 at 10:31:34PM +0200, René Scharfe wrote:
>
>> Am 11.09.21 um 19:07 schrieb Jeff King:
>>> On Sat, Sep 11, 2021 at 06:08:42PM +0200, René Scharfe wrote:
>>>
>>>> @@ -304,8 +307,7 @@ static int nth_midxed_pack_entry(struct repository *r,
>>>>  	if (!is_pack_valid(p))
>>>>  		return 0;
>>>>
>>>> -	nth_midxed_object_oid(&oid, m, pos);
>>>> -	if (oidset_contains(&p->bad_objects, &oid))
>>>> +	if (oidset_contains(&p->bad_objects, oid))
>>>>  		return 0;
>>>
>>> So we get to avoid the nth_midxed_object_oid() copy entirely. Very nice.
>>>
>>> Compared to the code before your series, we still have an extra function
>>> call to oidset_contains(), which will (in the common case) notice we
>>> have no entries and immediately return. But I think that's getting into
>>> pointless micro-optimization.
>>
>> Right.  I measure a 0.5% slowdown for git multi-pack-index verify.  An
>> inline oidset_size call avoids it.  That's easy enough to add, so let's
>> have it!
>
> I don't mind that, but I wonder if we can have our cake and eat it, too.
>
> oidset_contains() is short, too, and could be inlined. Or if we're
> worried about the size of the embedded kh_get_oid_set() getting inlined,
> we could do something like:
>
>   static inline int oidset_contains(const struct oidset *set, const
> 				    struct object_id *oid)
>   {
> 	if (!oidset_size(set))
> 		return 0;
> 	return oidset_contains_func(set, oid);
>   }
>
> That saves callers from having to deal with it, at the expense of a
> slightly complicated oidset implementation.
>
> I guess it's an extra integer comparison for callers that _do_ expect to
> have a non-empty set. So maybe it is better left to the caller to
> decide whether to optimize in this way.
>
> (A totally inline oidset_contains() avoids the extra check, but possibly
> at the cost of larger code size).

I wondered the same.

Inlining oidset_contains() would follow the spirit of khash.  It adds
16KB to my build (ca. 688 bytes per caller).  Hmm.

I expected the hybrid approach with an inlined emptiness check and a
shared actual contains function to be as fast as the original code, due
to caching.  I actually saw the 0.1% slowdown of git multi-pack-index
verify when I added a fake bad object at the end of prepare_midx_pack()
to simulate a non-empty oidset.  Hmm!

Both are probably defensible, but for this series I took the more
targeted approach to limit the impact.

René

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
                     ` (6 preceding siblings ...)
  2021-09-11 21:22   ` Jeff King
@ 2021-09-11 23:59   ` Derrick Stolee
  2021-09-12  1:51     ` Taylor Blau
  7 siblings, 1 reply; 28+ messages in thread
From: Derrick Stolee @ 2021-09-11 23:59 UTC (permalink / raw)
  To: René Scharfe, Git List; +Cc: Junio C Hamano, Jeff King

On 9/11/21 4:31 PM, René Scharfe wrote:
> Replace the custom hash array for remembering corrupt pack entries with
> an oidset.  This shortens and simplifies the code.
> 
> Changes since v1:
> - inline oidset_size()
> - inline nth_midxed_pack_entry() early
> - use oidset_size() to avoid a function call if no bad objects exist
> 
>   oidset: make oidset_size() an inline function
>   midx: inline nth_midxed_pack_entry()
>   packfile: convert mark_bad_packed_object() to object_id
>   packfile: convert has_packed_and_bad() to object_id
>   packfile: use oidset for bad objects

These were easy reads, and I understand the value of them.

I initially hesitated to support the drop of
nth_midxed_pack_entry(), since it was designed with things
like midx bitmaps in mind (specifically, to also support
lex-order-to-stable-order conversions). However, it seems
that the midx bitmap series by Taylor is succeeding without
needing such a translation.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-11 23:59   ` Derrick Stolee
@ 2021-09-12  1:51     ` Taylor Blau
  2021-09-12  2:29       ` Taylor Blau
  0 siblings, 1 reply; 28+ messages in thread
From: Taylor Blau @ 2021-09-12  1:51 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: René Scharfe, Git List, Junio C Hamano, Jeff King

On Sat, Sep 11, 2021 at 07:59:40PM -0400, Derrick Stolee wrote:
> I initially hesitated to support the drop of
> nth_midxed_pack_entry(), since it was designed with things
> like midx bitmaps in mind (specifically, to also support
> lex-order-to-stable-order conversions).

I didn't know that nth_midxed_pack_entry was designed with either
purpose in mind, since it predates midx bitmaps by quite a bit.

> However, it seems that the midx bitmap series by Taylor is succeeding
> without needing such a translation.

Right, it looks like that function is only called by fill_midx_entry()
where it was inlined.

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-12  1:51     ` Taylor Blau
@ 2021-09-12  2:29       ` Taylor Blau
  2021-09-12  3:51         ` Derrick Stolee
  0 siblings, 1 reply; 28+ messages in thread
From: Taylor Blau @ 2021-09-12  2:29 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: René Scharfe, Git List, Junio C Hamano, Jeff King

On Sat, Sep 11, 2021 at 09:51:04PM -0400, Taylor Blau wrote:
> On Sat, Sep 11, 2021 at 07:59:40PM -0400, Derrick Stolee wrote:
> > I initially hesitated to support the drop of
> > nth_midxed_pack_entry(), since it was designed with things
> > like midx bitmaps in mind (specifically, to also support
> > lex-order-to-stable-order conversions).
>
> I didn't know that nth_midxed_pack_entry was designed with either
> purpose in mind, since it predates midx bitmaps by quite a bit.

Thinking on it more, I can imagine that you wrote this function
aspirationally envisioning something like MIDX bitmaps. And since you
and I discussed the design together quite a bit, I imagine that that's
the case ;-).

But I agree that after reading this series again, that the inline-ing
suggested makes sense (and doesn't conflict with any series I have in
flight which don't add any new callers).

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-12  2:29       ` Taylor Blau
@ 2021-09-12  3:51         ` Derrick Stolee
  2021-09-12  4:01           ` Taylor Blau
  0 siblings, 1 reply; 28+ messages in thread
From: Derrick Stolee @ 2021-09-12  3:51 UTC (permalink / raw)
  To: Taylor Blau; +Cc: René Scharfe, Git List, Junio C Hamano, Jeff King

On 9/11/21 10:29 PM, Taylor Blau wrote:
> On Sat, Sep 11, 2021 at 09:51:04PM -0400, Taylor Blau wrote:
>> On Sat, Sep 11, 2021 at 07:59:40PM -0400, Derrick Stolee wrote:
>>> I initially hesitated to support the drop of
>>> nth_midxed_pack_entry(), since it was designed with things
>>> like midx bitmaps in mind (specifically, to also support
>>> lex-order-to-stable-order conversions).
>>
>> I didn't know that nth_midxed_pack_entry was designed with either
>> purpose in mind, since it predates midx bitmaps by quite a bit.
> 
> Thinking on it more, I can imagine that you wrote this function
> aspirationally envisioning something like MIDX bitmaps. And since you
> and I discussed the design together quite a bit, I imagine that that's
> the case ;-).
> 
> But I agree that after reading this series again, that the inline-ing
> suggested makes sense (and doesn't conflict with any series I have in
> flight which don't add any new callers).

I'm thinking more to my original design of the multi-pack-index.
At that time, I was thinking about the possible integration
with bitmaps based on my experience in other systems which used
a stable object order to allow writing bitmaps asynchronously
with respect to the multi-pack-index write and object packing.

One thing that you did when first considering bitmaps over the
multi-pack-index was to demonstrate that a stable object order
is not required, which surprised and delighted me. It greatly
reduced the complexity of the problem, and being able to inline
this method is only one small fallout from that simplicity.

Thanks,
-Stolee

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 0/5] packfile: use oidset for bad objects
  2021-09-12  3:51         ` Derrick Stolee
@ 2021-09-12  4:01           ` Taylor Blau
  0 siblings, 0 replies; 28+ messages in thread
From: Taylor Blau @ 2021-09-12  4:01 UTC (permalink / raw)
  To: Derrick Stolee; +Cc: René Scharfe, Git List, Junio C Hamano, Jeff King

On Sat, Sep 11, 2021 at 11:51:36PM -0400, Derrick Stolee wrote:
> On 9/11/21 10:29 PM, Taylor Blau wrote:
> > On Sat, Sep 11, 2021 at 09:51:04PM -0400, Taylor Blau wrote:
> >> On Sat, Sep 11, 2021 at 07:59:40PM -0400, Derrick Stolee wrote:
> >>> I initially hesitated to support the drop of
> >>> nth_midxed_pack_entry(), since it was designed with things
> >>> like midx bitmaps in mind (specifically, to also support
> >>> lex-order-to-stable-order conversions).
> >>
> >> I didn't know that nth_midxed_pack_entry was designed with either
> >> purpose in mind, since it predates midx bitmaps by quite a bit.
> >
> > Thinking on it more, I can imagine that you wrote this function
> > aspirationally envisioning something like MIDX bitmaps. And since you
> > and I discussed the design together quite a bit, I imagine that that's
> > the case ;-).
> >
> > But I agree that after reading this series again, that the inline-ing
> > suggested makes sense (and doesn't conflict with any series I have in
> > flight which don't add any new callers).
>
> I'm thinking more to my original design of the multi-pack-index.
> At that time, I was thinking about the possible integration
> with bitmaps based on my experience in other systems which used
> a stable object order to allow writing bitmaps asynchronously
> with respect to the multi-pack-index write and object packing.

Makes sense, and thank you for clarifying. After re-reading my first
email, I figured that this is what you must have been talking about,
which is why I felt like I should rephrase (hence the follow-up email).

> One thing that you did when first considering bitmaps over the
> multi-pack-index was to demonstrate that a stable object order
> is not required, which surprised and delighted me. It greatly
> reduced the complexity of the problem, and being able to inline
> this method is only one small fallout from that simplicity.

This was definitely a consequence of what I had observed from seeing
what was "slow" when running bitmaps in production at GitHub. There,
repacking a repository's objects all into one pack each time we ran our
automated background jobs far outpaced the amount of time we spent
generating bitmaps.

And with your and Peff's work on improving bitmap generation itself,
things are in a pretty good place. I do have some potential ideas for
future improvement, like a mode where bitmaps are only "fast forwarded",
meaning that new bitmaps are only added between the commits selected for
bitmapping in the previous round and the current reference tips.

I think things like that end up getting you pretty far, but it may be
interesting to come back eventually and revisit adding a stable object
order. In the meantime, though... ;)

Thanks,
Taylor

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-09-12  4:01 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-11  7:50 [PATCH 0/3] packfile: use oidset for bad objects René Scharfe
2021-09-11  7:59 ` [PATCH 1/3] packfile: convert mark_bad_packed_object() to object_id René Scharfe
2021-09-11  8:00 ` [PATCH 2/3] packfile: convert has_packed_and_bad() " René Scharfe
2021-09-11  8:01 ` [PATCH 3/3] packfile: use oidset for bad objects René Scharfe
2021-09-11 14:26   ` Jeff King
2021-09-11 16:08     ` René Scharfe
2021-09-11 17:03       ` Jeff King
2021-09-11 17:16         ` René Scharfe
2021-09-11 14:27 ` [PATCH 0/3] " Jeff King
2021-09-11 16:08 ` [PATCH 4/3] midx: inline nth_midxed_pack_entry() René Scharfe
2021-09-11 17:03   ` René Scharfe
2021-09-11 17:07   ` Jeff King
2021-09-11 20:31     ` René Scharfe
2021-09-11 21:20       ` Jeff King
2021-09-11 23:39         ` René Scharfe
2021-09-11 20:31 ` [PATCH v2 0/5] packfile: use oidset for bad objects René Scharfe
2021-09-11 20:36   ` [PATCH 1/5] oidset: make oidset_size() an inline function René Scharfe
2021-09-11 20:39   ` [PATCH 2/5] midx: inline nth_midxed_pack_entry() René Scharfe
2021-09-11 20:40   ` [PATCH 3/5] packfile: convert mark_bad_packed_object() to object_id René Scharfe
2021-09-11 20:42   ` [PATCH v2 4/5] packfile: convert has_packed_and_bad() " René Scharfe
2021-09-11 20:43   ` [PATCH v2 5/5] packfile: use oidset for bad objects René Scharfe
2021-09-11 20:45   ` [PATCH v2 0/5] " René Scharfe
2021-09-11 21:22   ` Jeff King
2021-09-11 23:59   ` Derrick Stolee
2021-09-12  1:51     ` Taylor Blau
2021-09-12  2:29       ` Taylor Blau
2021-09-12  3:51         ` Derrick Stolee
2021-09-12  4:01           ` Taylor Blau

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.