[PATCH 1/2] fsck: use strbuf_getline() to read skiplist file

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file
@ 2018-08-11 15:39 René Scharfe
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-11 15:39 UTC (permalink / raw)
  To: Git List
  Cc: Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

The char array named "buffer" is unlikely to contain a NUL character, so
printing its contents using %s in a die() format is unsafe.  Clang's
ASan reports running over the end of buffer in the recently added
skiplist tests in t5504-fetch-receive-strict.sh as a result.

Use an idiomatic strbuf_getline() loop instead, which ensures the buffer
is always NUL-terminated.  As a side-effect this also adds support for
skiplist files with CRLF line endings.

Signed-off-by: Rene Scharfe <l.s.r@web.de>
---
  fsck.c | 23 ++++++++++-------------
  1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/fsck.c b/fsck.c
index a0cee0be59..83f4562390 100644
--- a/fsck.c
+++ b/fsck.c
@@ -183,8 +183,9 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
  static void init_skiplist(struct fsck_options *options, const char *path)
  {
  	static struct oid_array skiplist = OID_ARRAY_INIT;
-	int sorted, fd;
-	char buffer[GIT_MAX_HEXSZ + 1];
+	int sorted;
+	FILE *fp;
+	struct strbuf sb = STRBUF_INIT;
  	struct object_id oid;
  
  	if (options->skiplist)
@@ -194,25 +195,21 @@ static void init_skiplist(struct fsck_options *options, const char *path)
  		options->skiplist = &skiplist;
  	}
  
-	fd = open(path, O_RDONLY);
-	if (fd < 0)
+	fp = fopen(path, "r");
+	if (!fp)
  		die("Could not open skip list: %s", path);
-	for (;;) {
+	while (!strbuf_getline(&sb, fp)) {
  		const char *p;
-		int result = read_in_full(fd, buffer, sizeof(buffer));
-		if (result < 0)
-			die_errno("Could not read '%s'", path);
-		if (!result)
-			break;
-		if (parse_oid_hex(buffer, &oid, &p) || *p != '\n')
-			die("Invalid SHA-1: %s", buffer);
+		if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
+			die("Invalid SHA-1: %s", sb.buf);
  		oid_array_append(&skiplist, &oid);
  		if (sorted && skiplist.nr > 1 &&
  				oidcmp(&skiplist.oid[skiplist.nr - 2],
  				       &oid) > 0)
  			sorted = 0;
  	}
-	close(fd);
+	fclose(fp);
+	strbuf_release(&sb);
  
  	if (sorted)
  		skiplist.sorted = 1;
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 15:39 [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file René Scharfe
@ 2018-08-11 15:47 ` René Scharfe
  2018-08-11 16:54   ` Ævar Arnfjörð Bjarmason
                     ` (3 more replies)
  2018-08-11 16:48 ` [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file Jeff King
                   ` (2 subsequent siblings)
  3 siblings, 4 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-11 15:47 UTC (permalink / raw)
  To: Git List
  Cc: Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Object IDs to skip are stored in a shared static oid_array.  Lookups do
a binary search on the sorted array.  The code checks if the object IDs
are already in the correct order while loading and skips sorting in that
case.

Simplify the code by using an oidset instead.  Memory usage is a bit
higher, but lookups are done in constant time and there is no need to
worry about any sort order.

Embed the oidset into struct fsck_options to make its ownership clear
(no hidden sharing) and avoid unnecessary pointer indirection.

Signed-off-by: Rene Scharfe <l.s.r@web.de>
---
  fsck.c | 23 ++---------------------
  fsck.h |  8 +++++---
  2 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/fsck.c b/fsck.c
index 83f4562390..9246afee22 100644
--- a/fsck.c
+++ b/fsck.c
@@ -10,7 +10,6 @@
  #include "fsck.h"
  #include "refs.h"
  #include "utf8.h"
-#include "sha1-array.h"
  #include "decorate.h"
  #include "oidset.h"
  #include "packfile.h"
@@ -182,19 +181,10 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
  
  static void init_skiplist(struct fsck_options *options, const char *path)
  {
-	static struct oid_array skiplist = OID_ARRAY_INIT;
-	int sorted;
  	FILE *fp;
  	struct strbuf sb = STRBUF_INIT;
  	struct object_id oid;
  
-	if (options->skiplist)
-		sorted = options->skiplist->sorted;
-	else {
-		sorted = 1;
-		options->skiplist = &skiplist;
-	}
-
  	fp = fopen(path, "r");
  	if (!fp)
  		die("Could not open skip list: %s", path);
@@ -202,17 +192,10 @@ static void init_skiplist(struct fsck_options *options, const char *path)
  		const char *p;
  		if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
  			die("Invalid SHA-1: %s", sb.buf);
-		oid_array_append(&skiplist, &oid);
-		if (sorted && skiplist.nr > 1 &&
-				oidcmp(&skiplist.oid[skiplist.nr - 2],
-				       &oid) > 0)
-			sorted = 0;
+		oidset_insert(&options->skiplist, &oid);
  	}
  	fclose(fp);
  	strbuf_release(&sb);
-
-	if (sorted)
-		skiplist.sorted = 1;
  }
  
  static int parse_msg_type(const char *str)
@@ -317,9 +300,7 @@ static void append_msg_id(struct strbuf *sb, const char *msg_id)
  
  static int object_on_skiplist(struct fsck_options *opts, struct object *obj)
  {
-	if (opts && opts->skiplist && obj)
-		return oid_array_lookup(opts->skiplist, &obj->oid) >= 0;
-	return 0;
+	return opts && obj && oidset_contains(&opts->skiplist, &obj->oid);
  }
  
  __attribute__((format (printf, 4, 5)))
diff --git a/fsck.h b/fsck.h
index c3cf5e0034..26120e6186 100644
--- a/fsck.h
+++ b/fsck.h
@@ -1,6 +1,8 @@
  #ifndef GIT_FSCK_H
  #define GIT_FSCK_H
  
+#include "oidset.h"
+
  #define FSCK_ERROR 1
  #define FSCK_WARN 2
  #define FSCK_IGNORE 3
@@ -34,12 +36,12 @@ struct fsck_options {
  	fsck_error error_func;
  	unsigned strict:1;
  	int *msg_type;
-	struct oid_array *skiplist;
+	struct oidset skiplist;
  	struct decoration *object_names;
  };
  
-#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL }
-#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL }
+#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL, OIDSET_INIT }
+#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL, OIDSET_INIT }
  
  /* descend in all linked child objects
   * the return value is:
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
@ 2018-08-11 16:54   ` Ævar Arnfjörð Bjarmason
  2018-08-25 18:49     ` René Scharfe
  2018-08-11 17:02   ` Jeff King
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-08-11 16:54 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ramsay Jones, Johannes Schindelin, Junio C Hamano

On Sat, Aug 11 2018, René Scharfe wrote:

> Object IDs to skip are stored in a shared static oid_array.  Lookups do
> a binary search on the sorted array.  The code checks if the object IDs
> are already in the correct order while loading and skips sorting in that
> case.

I think this change makes sense, but it's missing an update to the
relevant documentation in Documentation/config.txt:

    fsck.skipList::
    	The path to a sorted list of object names (i.e. one SHA-1 per
    	line) that are known to be broken in a non-fatal way and should
    	be ignored. This feature is useful when an established project
    	should be accepted despite early commits containing errors that
    	can be safely ignored such as invalid committer email addresses.
    	Note: corrupt objects cannot be skipped with this setting.

Also, while I use the skipList feature it's for something on the order
of 10-100 objects, so whatever algorithm the lookup uses isn't going to
matter, but I think it's interesting to describe the trade-off in the
commit message.

I.e. what if I have 100K objects listed in the skipList, is it only
going to be read lazily during fsck if there's an issue, or on every
object etc? What's the difference in performance?

Before this change, I wanted to follow-up my ab/fsck-transfer-updates
with something where we'd die if we found the skipList wasn't ordered as
we read it, but from a UI POV this is even better.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 16:54   ` Ævar Arnfjörð Bjarmason
@ 2018-08-25 18:49     ` René Scharfe
  0 siblings, 0 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-25 18:49 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Git List, Ramsay Jones, Johannes Schindelin, Junio C Hamano

Am 11.08.2018 um 18:54 schrieb Ævar Arnfjörð Bjarmason:
> 
> On Sat, Aug 11 2018, René Scharfe wrote:
> 
>> Object IDs to skip are stored in a shared static oid_array.  Lookups do
>> a binary search on the sorted array.  The code checks if the object IDs
>> are already in the correct order while loading and skips sorting in that
>> case.
> 
> I think this change makes sense, but it's missing an update to the
> relevant documentation in Documentation/config.txt:
> 
>     fsck.skipList::
>     	The path to a sorted list of object names (i.e. one SHA-1 per
>     	line) that are known to be broken in a non-fatal way and should
>     	be ignored. This feature is useful when an established project
>     	should be accepted despite early commits containing errors that
>     	can be safely ignored such as invalid committer email addresses.
>     	Note: corrupt objects cannot be skipped with this setting.
> 
> Also, while I use the skipList feature it's for something on the order
> of 10-100 objects, so whatever algorithm the lookup uses isn't going to
> matter, but I think it's interesting to describe the trade-off in the
> commit message.
> 
> I.e. what if I have 100K objects listed in the skipList, is it only
> going to be read lazily during fsck if there's an issue, or on every
> object etc? What's the difference in performance?

The list is loaded once up-front and a lookup is done for each
reportable issue and .gitmodules file.  Repositories without them
won't be affected at all.

100K bad objects sounds a bit extreme, but a fast-import can create
such a repository relatively quickly.  Here are the numbers with and
without the two patches:

Test                                            origin/master     HEAD
----------------------------------------------------------------------------------------
1450.3: fsck with 0 skipped bad commits         0.84(0.78+0.05)   0.83(0.80+0.03) -1.2%
1450.5: fsck with 1 skipped bad commits         0.84(0.77+0.07)   0.84(0.79+0.05) +0.0%
1450.7: fsck with 10 skipped bad commits        0.86(0.81+0.05)   0.84(0.78+0.06) -2.3%
1450.9: fsck with 100 skipped bad commits       0.85(0.78+0.07)   0.84(0.78+0.05) -1.2%
1450.11: fsck with 1000 skipped bad commits     0.85(0.80+0.05)   0.84(0.79+0.05) -1.2%
1450.13: fsck with 10000 skipped bad commits    0.85(0.78+0.07)   0.82(0.75+0.06) -3.5%
1450.15: fsck with 100000 skipped bad commits   0.73(0.64+0.09)   0.64(0.62+0.02) -12.3%

They look the same for most sizes, but with all 100000 bad objects in
the skiplist the oidset wins decisively.  Dialing it up a notch:

Test                                             origin/master     HEAD
-----------------------------------------------------------------------------------------
1450.3: fsck with 0 skipped bad commits          8.61(7.94+0.66)   8.72(8.14+0.58) +1.3%
1450.5: fsck with 1 skipped bad commits          8.51(7.87+0.63)   8.53(7.93+0.60) +0.2%
1450.7: fsck with 10 skipped bad commits         8.56(7.98+0.57)   8.54(7.91+0.63) -0.2%
1450.9: fsck with 100 skipped bad commits        8.65(8.00+0.65)   8.47(7.93+0.53) -2.1%
1450.11: fsck with 1000 skipped bad commits      8.69(8.16+0.53)   8.49(8.00+0.49) -2.3%
1450.13: fsck with 10000 skipped bad commits     8.69(8.13+0.56)   8.50(7.93+0.56) -2.2%
1450.15: fsck with 100000 skipped bad commits    8.78(8.18+0.60)   8.36(7.85+0.50) -4.8%
1450.17: fsck with 1000000 skipped bad commits   7.83(7.07+0.76)   6.55(6.42+0.12) -16.3%

So it looks like successful lookups are faster in the oidset and
the oid_array is quicker with just a handful of entries.

A L1 cache line of 64 bytes holds 3 consecutive SHA1 hashes, which
might explain it.

Here's the perf test code:

---
 t/perf/p1450-fsck.sh | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100755 t/perf/p1450-fsck.sh

diff --git a/t/perf/p1450-fsck.sh b/t/perf/p1450-fsck.sh
new file mode 100755
index 0000000000..7c89690777
--- /dev/null
+++ b/t/perf/p1450-fsck.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+test_description='Test fsck skipList performance'
+
+. ./perf-lib.sh
+
+test_perf_fresh_repo
+
+n=100000
+
+test_expect_success "setup $n bad commits" '
+	for i in $(test_seq 1 $n)
+	do
+		echo "commit refs/heads/master" &&
+		echo "committer C <c@example.com> 1234567890 +0000" &&
+		echo "data <<EOF" &&
+		echo "$i.Q." &&
+		echo "EOF"
+	done | q_to_nul | git fast-import
+'
+
+skip=0
+while test $skip -le $n
+do
+	test_expect_success "create skipList for $skip bad commits" '
+		git log --format=%H --max-count=$skip |
+		sort >skiplist
+	'
+
+	test_perf "fsck with $skip skipped bad commits"	'
+		git -c fsck.skipList=skiplist fsck
+	'
+
+	case $skip in
+	0) skip=1 ;;
+	*) skip=${skip}0 ;;
+	esac
+done
+
+test_done
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
  2018-08-11 16:54   ` Ævar Arnfjörð Bjarmason
@ 2018-08-11 17:02   ` Jeff King
  2018-08-11 17:23     ` Jeff King
  2018-08-11 20:48   ` Ramsay Jones
  2018-08-13 18:43   ` Junio C Hamano
  3 siblings, 1 reply; 31+ messages in thread
From: Jeff King @ 2018-08-11 17:02 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Sat, Aug 11, 2018 at 05:47:56PM +0200, René Scharfe wrote:

> Object IDs to skip are stored in a shared static oid_array.  Lookups do
> a binary search on the sorted array.  The code checks if the object IDs
> are already in the correct order while loading and skips sorting in that
> case.
> 
> Simplify the code by using an oidset instead.  Memory usage is a bit
> higher, but lookups are done in constant time and there is no need to
> worry about any sort order.
> 
> Embed the oidset into struct fsck_options to make its ownership clear
> (no hidden sharing) and avoid unnecessary pointer indirection.

I actually had a case[1] yesterday where it seems like oidset is a fair
bit slower than oid_array for a large set.

But:

  - loading the skiplist into memory has pretty lousy performance
    anyway. If we really care about performance of large lists, we
    should define a sorted on-disk format that can be mmap'd and
    searched directly.  Or if people are willing to tolerate false
    positives, even a bloom filter.

    I've never really used a big skiplist myself, so I haven't done any
    work towards those things.

  - we could probably improve the speed of oidset. Two things I notice
    about its implementation:

      - it has to malloc for each entry, which I suspect is the main
	bottleneck. We could probably pool-allocate blocks, and when
	entries get removed just leave the allocations in place until we
	clear(). Most callers tend to build up a set and then query it a
	lot, or possibly remove items from the set until it's empty. But
	my guess is that few or none want a long-lived set that they add
	and remove from randomly.

      - insertion lets you do check-and-insert as a single operation
	(something I failed to notice in [1]). But it's not implemented
	as efficiently as it could be, since the "contains" and "put"
	operations do separate lookups. This doesn't matter for a set
	that's queried a lot more, but for something like de-duping
	(like I was doing in [1]) most operations are check-and-insert.

Most of that is just food for thought, but it possibly argues that we
should not care about performance characteristics for swapping out
oid_array and oidset here (i.e., that your patch is fine, and the
simplicity benefit is the most important thing).

[1] https://public-inbox.org/git/20180810232457.GG19875@sigill.intra.peff.net/
    but note that it's buried pretty deep.

> ---
>  fsck.c | 23 ++---------------------
>  fsck.h |  8 +++++---
>  2 files changed, 7 insertions(+), 24 deletions(-)

Again, I didn't see anything wrong with the patch itself.

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 17:02   ` Jeff King
@ 2018-08-11 17:23     ` Jeff King
  2018-08-11 20:59       ` René Scharfe
  2018-08-13 17:15       ` René Scharfe
  0 siblings, 2 replies; 31+ messages in thread
From: Jeff King @ 2018-08-11 17:23 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Sat, Aug 11, 2018 at 01:02:48PM -0400, Jeff King wrote:

>   - we could probably improve the speed of oidset. Two things I notice
>     about its implementation:
> 
>       - it has to malloc for each entry, which I suspect is the main
> 	bottleneck. We could probably pool-allocate blocks, and when
> 	entries get removed just leave the allocations in place until we
> 	clear(). Most callers tend to build up a set and then query it a
> 	lot, or possibly remove items from the set until it's empty. But
> 	my guess is that few or none want a long-lived set that they add
> 	and remove from randomly.
> 
>       - insertion lets you do check-and-insert as a single operation
> 	(something I failed to notice in [1]). But it's not implemented
> 	as efficiently as it could be, since the "contains" and "put"
> 	operations do separate lookups. This doesn't matter for a set
> 	that's queried a lot more, but for something like de-duping
> 	(like I was doing in [1]) most operations are check-and-insert.
> [...]
> [1] https://public-inbox.org/git/20180810232457.GG19875@sigill.intra.peff.net/
>     but note that it's buried pretty deep.

Some notes on this, based on the cat-file patch that I linked to.

Before any optimizations, my best-of-five timing for:

  git cat-file --batch-all-objects --unordered --buffer \
               --batch-check='%(objectname)' >/dev/null

in git.git was:

  real	0m0.434s
  user	0m0.414s
  sys	0m0.020s

That's enumerating every object in the repo but not doing much more than
de-duping the names and printing them.

Applying this patch:

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 45992c9be9..04b5cda191 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -443,9 +443,8 @@ static int batch_unordered_object(const struct object_id *oid, void *vdata)
 {
 	struct object_cb_data *data = vdata;
 
-	if (oidset_contains(data->seen, oid))
+	if (oidset_insert(data->seen, oid))
 		return 0;
-	oidset_insert(data->seen, oid);
 
 	return batch_object_cb(oid, data);
 }

to use the single-call set-and-replace doesn't seem to help (any
improvement is within the run-to-run noise). So a single hash lookup per
object does not seem to be measurable. And thus teaching oidset_insert()
to do a single hash lookup for check-and-insert is unlikely to help us.

On top of that, I tried using a pool to store the set entries:

diff --git a/oidset.c b/oidset.c
index 454c54f933..504929f177 100644
--- a/oidset.c
+++ b/oidset.c
@@ -17,7 +17,10 @@ int oidset_insert(struct oidset *set, const struct object_id *oid)
 	else if (oidset_contains(set, oid))
 		return 1;
 
-	entry = xmalloc(sizeof(*entry));
+	if (!set->pool)
+		mem_pool_init(&set->pool, 0);
+
+	entry = mem_pool_alloc(set->pool, sizeof(*entry));
 	oidcpy(&entry->oid, oid);
 
 	oidmap_put(&set->map, entry);
@@ -29,12 +32,13 @@ int oidset_remove(struct oidset *set, const struct object_id *oid)
 	struct oidmap_entry *entry;
 
 	entry = oidmap_remove(&set->map, oid);
-	free(entry);
+	/* abandon pool memory for "entry" */
 
 	return (entry != NULL);
 }
 
 void oidset_clear(struct oidset *set)
 {
-	oidmap_free(&set->map, 1);
+	oidmap_free(&set->map, 0);
+	mem_pool_discard(set->pool, 0);
 }
diff --git a/oidset.h b/oidset.h
index 40ec5f87fe..6b8b802987 100644
--- a/oidset.h
+++ b/oidset.h
@@ -20,6 +20,7 @@
  */
 struct oidset {
 	struct oidmap map;
+	struct mem_pool *pool;
 };
 
 #define OIDSET_INIT { OIDMAP_INIT }

That drops my best-of-five to:

  real	0m0.300s
  user	0m0.288s
  sys	0m0.012s

which is over a 25% speedup. So that does seem worth pursuing.

For reference, the oid_array code path for cat-file is still:

  real	0m0.161s
  user	0m0.157s
  sys	0m0.004s

but that's not completely apples to apples. The oidset code is also
iterating the packfiles in a different order and generating a revidx
(which I know is about 25ms in this repo). So a better test would
actually swap one data structure out for the other with no other changes
(I just happened to have this test handy, and really wanted to know
whether the mem_pool stuff would help).

-Peff

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 17:23     ` Jeff King
@ 2018-08-11 20:59       ` René Scharfe
  2018-08-13 17:15         ` René Scharfe
  2018-08-13 17:15       ` René Scharfe
  1 sibling, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-11 20:59 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 11.08.2018 um 19:23 schrieb Jeff King:
> On Sat, Aug 11, 2018 at 01:02:48PM -0400, Jeff King wrote:
> 
>>    - we could probably improve the speed of oidset. Two things I notice
>>      about its implementation:

> Before any optimizations, my best-of-five timing for:
> 
>    git cat-file --batch-all-objects --unordered --buffer \
>                 --batch-check='%(objectname)' >/dev/null
> 
> in git.git was:
> 
>    real	0m0.434s
>    user	0m0.414s
>    sys	0m0.020s
> 
> That's enumerating every object in the repo but not doing much more than
> de-duping the names and printing them.

> So a single hash lookup per
> object does not seem to be measurable. And thus teaching oidset_insert()
> to do a single hash lookup for check-and-insert is unlikely to help us.

> On top of that, I tried using a pool to store the set entries:

> That drops my best-of-five to:
> 
>    real	0m0.300s
>    user	0m0.288s
>    sys	0m0.012s
> 
> which is over a 25% speedup. So that does seem worth pursuing.

> For reference, the oid_array code path for cat-file is still:
> 
>    real	0m0.161s
>    user	0m0.157s
>    sys	0m0.004s
> 
> but that's not completely apples to apples. The oidset code is also
> iterating the packfiles in a different order and generating a revidx
> (which I know is about 25ms in this repo). So a better test would
> actually swap one data structure out for the other with no other changes
> (I just happened to have this test handy, and really wanted to know
> whether the mem_pool stuff would help).

If the current oidset implementation is so bad, why not replace it with
one based on oid_array? ;-)

Intuitively I'd try a hashmap with no payload and open addressing via
sha1hash(), which should reduce memory allocations quite a bit -- no
need to store hash codes and next pointers, only an array of object IDs
with a fill rate of 50% or so.  Deletions are a bit awkward with that
scheme, though; they could perhaps be implemented as insertions into a
second hashmap.

Balancing might become a problem.  Your cat-file example should be fine,
but someone could decide to add only hashes with a certain prefix to
their skiplist, and lookups would lopsided.

But first we'd need something like test-sha1-array for oidset and
some kind of performance tests based on these helpers, right?

René

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 20:59       ` René Scharfe
@ 2018-08-13 17:15         ` René Scharfe
  2018-08-14  1:58           ` Jeff King
  0 siblings, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-13 17:15 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 11.08.2018 um 22:59 schrieb René Scharfe:
> If the current oidset implementation is so bad, why not replace it with
> one based on oid_array? ;-)
> 
> Intuitively I'd try a hashmap with no payload and open addressing via
> sha1hash(), which should reduce memory allocations quite a bit -- no
> need to store hash codes and next pointers, only an array of object IDs
> with a fill rate of 50% or so.  Deletions are a bit awkward with that
> scheme, though; they could perhaps be implemented as insertions into a
> second hashmap.

Here's roughly what I had in mind, only with a free/occupied bitmap (or
a one-bit payload, if you will).  I tried a variant that encoded empty
slots as null_oid first, which has lower memory usage, but isn't any
faster than the current code.

# in git.git
$ hyperfine "./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'"

Before:
Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'

   Time (mean ± σ):     269.5 ms ±  26.7 ms    [User: 247.7 ms, System: 21.4 ms]

   Range (min … max):   240.3 ms … 339.3 ms

After:
Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'

   Time (mean ± σ):     224.2 ms ±  18.2 ms    [User: 201.7 ms, System: 22.1 ms]

   Range (min … max):   205.0 ms … 259.0 ms

So that's only slightly faster. :-|

---
  builtin/cat-file.c | 93 +++++++++++++++++++++++++++++++++++++++++++---
  1 file changed, 88 insertions(+), 5 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 45992c9be9..b197cca861 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -408,10 +408,93 @@ static void batch_one_object(const char *obj_name, struct batch_options *opt,
  	batch_object_write(obj_name, opt, data);
  }
  
+struct oidset2 {
+	size_t nr, alloc;
+	struct object_id *entries;
+	uint32_t *bitmap;
+};
+
+static int is_bit_set(const uint32_t *bitmap, size_t idx)
+{
+	uint32_t mask = 1 << (idx % bitsizeof(bitmap[0]));
+	return bitmap[idx / bitsizeof(bitmap[0])] & mask;
+}
+
+static void set_bit(uint32_t *bitmap, size_t idx)
+{
+	uint32_t mask = 1 << (idx % bitsizeof(bitmap[0]));
+	bitmap[idx / bitsizeof(bitmap[0])] |= mask;
+}
+
+static void oidset2_add(struct oidset2 *set, const struct object_id *oid)
+{
+	size_t idx;
+
+	for (idx = sha1hash(oid->hash) % set->alloc;;) {
+		if (!is_bit_set(set->bitmap, idx))
+			break;
+		if (!oidcmp(&set->entries[idx], oid))
+			return;
+		if (++idx >= set->alloc)
+			idx = 0;
+	}
+	oidcpy(&set->entries[idx], oid);
+	set_bit(set->bitmap, idx);
+	set->nr++;
+}
+
+static void oidset2_grow(struct oidset2 *set)
+{
+	struct oidset2 old_set = *set;
+	size_t idx;
+
+	set->alloc = (old_set.alloc + 1000) * 3 / 2;
+	set->nr = 0;
+	set->entries = xcalloc(set->alloc, sizeof(set->entries[0]));
+	set->bitmap = xcalloc(set->alloc / 32 + 1, sizeof(set->bitmap[0]));
+	for (idx = 0; idx < old_set.alloc; idx++) {
+		if (!is_bit_set(old_set.bitmap, idx))
+			continue;
+		oidset2_add(set, &old_set.entries[idx]);
+	}
+	free(old_set.entries);
+	free(old_set.bitmap);
+}
+
+static void oidset2_insert(struct oidset2 *set, const struct object_id *oid)
+{
+	if (set->nr + 1 > set->alloc * 2 / 3)
+		oidset2_grow(set);
+	oidset2_add(set, oid);
+}
+
+static int oidset2_contains(struct oidset2 *set, const struct object_id *oid)
+{
+	size_t idx;
+
+	if (!set->nr)
+		return 0;
+	for (idx = sha1hash(oid->hash) % set->alloc;;) {
+		if (!is_bit_set(set->bitmap, idx))
+			return 0;
+		if (!oidcmp(&set->entries[idx], oid))
+			return 1;
+		if (++idx >= set->alloc)
+			idx = 0;
+	}
+}
+
+static void oidset2_clear(struct oidset2 *set)
+{
+	FREE_AND_NULL(set->entries);
+	FREE_AND_NULL(set->bitmap);
+	set->alloc = set->nr = 0;
+}
+
  struct object_cb_data {
  	struct batch_options *opt;
  	struct expand_data *expand;
-	struct oidset *seen;
+	struct oidset2 *seen;
  };
  
  static int batch_object_cb(const struct object_id *oid, void *vdata)
@@ -443,9 +526,9 @@ static int batch_unordered_object(const struct object_id *oid, void *vdata)
  {
  	struct object_cb_data *data = vdata;
  
-	if (oidset_contains(data->seen, oid))
+	if (oidset2_contains(data->seen, oid))
  		return 0;
-	oidset_insert(data->seen, oid);
+	oidset2_insert(data->seen, oid);
  
  	return batch_object_cb(oid, data);
  }
@@ -510,7 +593,7 @@ static int batch_objects(struct batch_options *opt)
  		cb.expand = &data;
  
  		if (opt->unordered) {
-			struct oidset seen = OIDSET_INIT;
+			struct oidset2 seen = { 0 };
  
  			cb.seen = &seen;
  
@@ -518,7 +601,7 @@ static int batch_objects(struct batch_options *opt)
  			for_each_packed_object(batch_unordered_packed, &cb,
  					       FOR_EACH_OBJECT_PACK_ORDER);
  
-			oidset_clear(&seen);
+			oidset2_clear(&seen);
  		} else {
  			struct oid_array sa = OID_ARRAY_INIT;
  
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-13 17:15         ` René Scharfe
@ 2018-08-14  1:58           ` Jeff King
  2018-08-14  2:03             ` Jeff King
  2018-08-26 11:37             ` René Scharfe
  0 siblings, 2 replies; 31+ messages in thread
From: Jeff King @ 2018-08-14  1:58 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Mon, Aug 13, 2018 at 07:15:23PM +0200, René Scharfe wrote:

> Am 11.08.2018 um 22:59 schrieb René Scharfe:
> > If the current oidset implementation is so bad, why not replace it with
> > one based on oid_array? ;-)
> > 
> > Intuitively I'd try a hashmap with no payload and open addressing via
> > sha1hash(), which should reduce memory allocations quite a bit -- no
> > need to store hash codes and next pointers, only an array of object IDs
> > with a fill rate of 50% or so.  Deletions are a bit awkward with that
> > scheme, though; they could perhaps be implemented as insertions into a
> > second hashmap.
> 
> Here's roughly what I had in mind, only with a free/occupied bitmap (or
> a one-bit payload, if you will).  I tried a variant that encoded empty
> slots as null_oid first, which has lower memory usage, but isn't any
> faster than the current code.

Hmph, I thought I had sent my version out last night, but it looks like
I didn't. I got similarly mediocre results.

Your suggestion can be implemented using khash (my patch below).

> Before:
> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
> 
>   Time (mean ± σ):     269.5 ms ±  26.7 ms    [User: 247.7 ms, System: 21.4 ms]
> 
>   Range (min … max):   240.3 ms … 339.3 ms
> 
> After:
> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
> 
>   Time (mean ± σ):     224.2 ms ±  18.2 ms    [User: 201.7 ms, System: 22.1 ms]
> 
>   Range (min … max):   205.0 ms … 259.0 ms

Yeah. My best-of-five dropped from 300ms to 247ms. That 300 was using
the memory pool, though khash's deletion strategy isn't all that
different (the wasted memory hangs around until the next hash resize,
but if you're evenly dropping and adding, you likely won't need to
resize).

Applying your patch, I get 337ms, worse than the hashmap with a memory
pool. I'm not sure why.

>  builtin/cat-file.c | 93 +++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 88 insertions(+), 5 deletions(-)

By the way, your patch seemed damaged (wouldn't apply, and "am -3"
complained of hand-editing). It looks like maybe there's an extra space
inserted in the context lines?

Anyway, here's the khash patch for reference.

diff --git a/fetch-pack.c b/fetch-pack.c
index 5714bcbddd..5a86b10a5e 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -534,7 +534,7 @@ static int tip_oids_contain(struct oidset *tip_oids,
 	 * add to "newlist" between calls, the additions will always be for
 	 * oids that are already in the set.
 	 */
-	if (!tip_oids->map.map.tablesize) {
+	if (!tip_oids->map) {
 		add_refs_to_oidset(tip_oids, unmatched);
 		add_refs_to_oidset(tip_oids, newlist);
 	}
diff --git a/oidset.c b/oidset.c
index 454c54f933..2964b43b2d 100644
--- a/oidset.c
+++ b/oidset.c
@@ -3,38 +3,44 @@
 
 int oidset_contains(const struct oidset *set, const struct object_id *oid)
 {
-	if (!set->map.map.tablesize)
+	khiter_t pos;
+
+	if (!set->map)
 		return 0;
-	return !!oidmap_get(&set->map, oid);
+
+	pos = kh_get_oid(set->map, *oid);
+	return pos < kh_end(set->map);
 }
 
 int oidset_insert(struct oidset *set, const struct object_id *oid)
 {
-	struct oidmap_entry *entry;
+	int hash_ret;
 
-	if (!set->map.map.tablesize)
-		oidmap_init(&set->map, 0);
-	else if (oidset_contains(set, oid))
-		return 1;
+	if (!set->map)
+		set->map = kh_init_oid();
 
-	entry = xmalloc(sizeof(*entry));
-	oidcpy(&entry->oid, oid);
-
-	oidmap_put(&set->map, entry);
-	return 0;
+	kh_put_oid(set->map, *oid, &hash_ret);
+	return !hash_ret;
 }
 
 int oidset_remove(struct oidset *set, const struct object_id *oid)
 {
-	struct oidmap_entry *entry;
+	khiter_t pos;
 
-	entry = oidmap_remove(&set->map, oid);
-	free(entry);
+	if (!set->map)
+		return 0;
+
+	pos = kh_get_oid(set->map, *oid);
+	if (pos < kh_end(set->map)) {
+		kh_del_oid(set->map, pos);
+		return 1;
+	}
 
-	return (entry != NULL);
+	return 0;
 }
 
 void oidset_clear(struct oidset *set)
 {
-	oidmap_free(&set->map, 1);
+	kh_destroy_oid(set->map);
+	set->map = NULL;
 }
diff --git a/oidset.h b/oidset.h
index 40ec5f87fe..4c4c5a42fe 100644
--- a/oidset.h
+++ b/oidset.h
@@ -2,6 +2,7 @@
 #define OIDSET_H
 
 #include "oidmap.h"
+#include "khash.h"
 
 /**
  * This API is similar to sha1-array, in that it maintains a set of object ids
@@ -15,19 +16,34 @@
  *      table overhead.
  */
 
+static inline unsigned int oid_hash(const struct object_id oid)
+{
+	unsigned int hash;
+	memcpy(&hash, oid.hash, sizeof(hash));
+	return hash;
+}
+
+static inline int oid_equal(const struct object_id a,
+			    const struct object_id b)
+{
+	return !oidcmp(&a, &b);
+}
+
+KHASH_INIT(oid, struct object_id, int, 0, oid_hash, oid_equal)
+
+
 /**
  * A single oidset; should be zero-initialized (or use OIDSET_INIT).
  */
 struct oidset {
-	struct oidmap map;
+	kh_oid_t *map;
 };
 
-#define OIDSET_INIT { OIDMAP_INIT }
-
+#define OIDSET_INIT { NULL }
 
 static inline void oidset_init(struct oidset *set, size_t initial_size)
 {
-	oidmap_init(&set->map, initial_size);
+	set->map = NULL;
 }
 
 /**
@@ -58,19 +74,25 @@ int oidset_remove(struct oidset *set, const struct object_id *oid);
 void oidset_clear(struct oidset *set);
 
 struct oidset_iter {
-	struct oidmap_iter m_iter;
+	kh_oid_t *map;
+	khiter_t iter;
 };
 
 static inline void oidset_iter_init(struct oidset *set,
 				    struct oidset_iter *iter)
 {
-	oidmap_iter_init(&set->map, &iter->m_iter);
+	iter->map = set->map;
+	iter->iter = kh_begin(iter->map);
 }
 
 static inline struct object_id *oidset_iter_next(struct oidset_iter *iter)
 {
-	struct oidmap_entry *e = oidmap_iter_next(&iter->m_iter);
-	return e ? &e->oid : NULL;
+	for (; iter->iter != kh_end(iter->map); iter->iter++) {
+		if (!kh_exist(iter->map, iter->iter))
+			continue;
+		return &kh_key(iter->map, iter->iter);
+	}
+	return NULL;
 }
 
 static inline struct object_id *oidset_iter_first(struct oidset *set,

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-14  1:58           ` Jeff King
@ 2018-08-14  2:03             ` Jeff King
  2018-08-26 11:37             ` René Scharfe
  1 sibling, 0 replies; 31+ messages in thread
From: Jeff King @ 2018-08-14  2:03 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Mon, Aug 13, 2018 at 09:58:42PM -0400, Jeff King wrote:

> >  builtin/cat-file.c | 93 +++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 88 insertions(+), 5 deletions(-)
> 
> By the way, your patch seemed damaged (wouldn't apply, and "am -3"
> complained of hand-editing). It looks like maybe there's an extra space
> inserted in the context lines?

Oh nevermind, I see you guys dug to the bottom of it elsewhere in the
thread. :)

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-14  1:58           ` Jeff King
  2018-08-14  2:03             ` Jeff King
@ 2018-08-26 11:37             ` René Scharfe
  2018-08-27 23:03               ` Jeff King
  1 sibling, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-26 11:37 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 14.08.2018 um 03:58 schrieb Jeff King:
> Your suggestion can be implemented using khash (my patch below).
> 
>> Before:
>> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
>>
>>   Time (mean ± σ):     269.5 ms ±  26.7 ms    [User: 247.7 ms, System: 21.4 ms]
>>
>>   Range (min … max):   240.3 ms … 339.3 ms
>>
>> After:
>> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
>>
>>   Time (mean ± σ):     224.2 ms ±  18.2 ms    [User: 201.7 ms, System: 22.1 ms]
>>
>>   Range (min … max):   205.0 ms … 259.0 ms
> 
> Yeah. My best-of-five dropped from 300ms to 247ms. That 300 was using
> the memory pool, though khash's deletion strategy isn't all that
> different (the wasted memory hangs around until the next hash resize,
> but if you're evenly dropping and adding, you likely won't need to
> resize).

With your khash patch:

Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'

  Time (mean ± σ):     159.1 ms ±  20.5 ms    [User: 140.3 ms, System: 18.5 ms]

  Range (min … max):   140.0 ms … 214.0 ms

So it seems worth it.

> Anyway, here's the khash patch for reference.
> 
> diff --git a/fetch-pack.c b/fetch-pack.c
> index 5714bcbddd..5a86b10a5e 100644
> --- a/fetch-pack.c
> +++ b/fetch-pack.c
> @@ -534,7 +534,7 @@ static int tip_oids_contain(struct oidset *tip_oids,
>  	 * add to "newlist" between calls, the additions will always be for
>  	 * oids that are already in the set.
>  	 */
> -	if (!tip_oids->map.map.tablesize) {
> +	if (!tip_oids->map) {
>  		add_refs_to_oidset(tip_oids, unmatched);
>  		add_refs_to_oidset(tip_oids, newlist);
>  	}

The caller shouldn't look at the private parts of the implementation
like this.  tablesize is the allocation count, which becomes non-zero
if at least one entry was added.  tip_oids is only inserted into, never
deleted from, so a count or size function could be used instead as a
cleaner interface.  (In a separate patch..)

> diff --git a/oidset.c b/oidset.c
> index 454c54f933..2964b43b2d 100644
> --- a/oidset.c
> +++ b/oidset.c
> @@ -3,38 +3,44 @@
>  
>  int oidset_contains(const struct oidset *set, const struct object_id *oid)
>  {
> -	if (!set->map.map.tablesize)
> +	khiter_t pos;
> +
> +	if (!set->map)
>  		return 0;
> -	return !!oidmap_get(&set->map, oid);
> +
> +	pos = kh_get_oid(set->map, *oid);
> +	return pos < kh_end(set->map);
>  }
>  
>  int oidset_insert(struct oidset *set, const struct object_id *oid)
>  {
> -	struct oidmap_entry *entry;
> +	int hash_ret;
>  
> -	if (!set->map.map.tablesize)
> -		oidmap_init(&set->map, 0);
> -	else if (oidset_contains(set, oid))
> -		return 1;
> +	if (!set->map)
> +		set->map = kh_init_oid();
>  
> -	entry = xmalloc(sizeof(*entry));
> -	oidcpy(&entry->oid, oid);
> -
> -	oidmap_put(&set->map, entry);
> -	return 0;
> +	kh_put_oid(set->map, *oid, &hash_ret);
> +	return !hash_ret;
>  }

So initialization is deferred to the first insert, and the empty set
can be represented in two ways -- map == NULL and map->size == 0.

I wondered about the performance impact of all those NULL checks at
insert and lookup and converted it to stack storage, with a dirty
hand-rolled oidset_clear() implementation.  It wasn't any faster.

>  
>  int oidset_remove(struct oidset *set, const struct object_id *oid)
>  {
> -	struct oidmap_entry *entry;
> +	khiter_t pos;
>  
> -	entry = oidmap_remove(&set->map, oid);
> -	free(entry);
> +	if (!set->map)
> +		return 0;
> +
> +	pos = kh_get_oid(set->map, *oid);
> +	if (pos < kh_end(set->map)) {
> +		kh_del_oid(set->map, pos);
> +		return 1;
> +	}
>  
> -	return (entry != NULL);
> +	return 0;
>  }
>  
>  void oidset_clear(struct oidset *set)
>  {
> -	oidmap_free(&set->map, 1);
> +	kh_destroy_oid(set->map);
> +	set->map = NULL;
>  }
> diff --git a/oidset.h b/oidset.h
> index 40ec5f87fe..4c4c5a42fe 100644
> --- a/oidset.h
> +++ b/oidset.h
> @@ -2,6 +2,7 @@
>  #define OIDSET_H
>  
>  #include "oidmap.h"

This can go.

> +#include "khash.h"
>  
>  /**
>   * This API is similar to sha1-array, in that it maintains a set of object ids
> @@ -15,19 +16,34 @@
>   *      table overhead.
>   */
>  
> +static inline unsigned int oid_hash(const struct object_id oid)
> +{
> +	unsigned int hash;
> +	memcpy(&hash, oid.hash, sizeof(hash));
> +	return hash;
> +}
> +
> +static inline int oid_equal(const struct object_id a,
> +			    const struct object_id b)
> +{
> +	return !oidcmp(&a, &b);
> +}

Look, it's oideq() from that other series in disguise! :)

> +
> +KHASH_INIT(oid, struct object_id, int, 0, oid_hash, oid_equal)

Note to self: The 0 is for kh_is_map, which means that no values are
kept and the given value type (int) doesn't matter.

> +
> +
>  /**
>   * A single oidset; should be zero-initialized (or use OIDSET_INIT).
>   */
>  struct oidset {
> -	struct oidmap map;
> +	kh_oid_t *map;
>  };
>  
> -#define OIDSET_INIT { OIDMAP_INIT }
> -
> +#define OIDSET_INIT { NULL }
>  
>  static inline void oidset_init(struct oidset *set, size_t initial_size)
>  {
> -	oidmap_init(&set->map, initial_size);
> +	set->map = NULL;
>  }

This ignores initial_size, which is misleading.  We should probably
call kh_resize_oid() here and move the function to oidset.c.  Or
get rid of the second parameter..

>  
>  /**
> @@ -58,19 +74,25 @@ int oidset_remove(struct oidset *set, const struct object_id *oid);
>  void oidset_clear(struct oidset *set);
>  
>  struct oidset_iter {
> -	struct oidmap_iter m_iter;
> +	kh_oid_t *map;
> +	khiter_t iter;
>  };
>  
>  static inline void oidset_iter_init(struct oidset *set,
>  				    struct oidset_iter *iter)
>  {
> -	oidmap_iter_init(&set->map, &iter->m_iter);
> +	iter->map = set->map;
> +	iter->iter = kh_begin(iter->map);
>  }

This is fine even if map == NULL, because kh_begin() can handle any
parameter value, as it ignores them...

>  
>  static inline struct object_id *oidset_iter_next(struct oidset_iter *iter)
>  {
> -	struct oidmap_entry *e = oidmap_iter_next(&iter->m_iter);
> -	return e ? &e->oid : NULL;
> +	for (; iter->iter != kh_end(iter->map); iter->iter++) {
> +		if (!kh_exist(iter->map, iter->iter))
> +			continue;
> +		return &kh_key(iter->map, iter->iter);
> +	}
> +	return NULL;
>  }

... but kh_end() dereferences map, so iterating a fresh oidset will
segfault here.

>  
>  static inline struct object_id *oidset_iter_first(struct oidset *set,
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-26 11:37             ` René Scharfe
@ 2018-08-27 23:03               ` Jeff King
  2018-10-01 19:15                 ` René Scharfe
  0 siblings, 1 reply; 31+ messages in thread
From: Jeff King @ 2018-08-27 23:03 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Sun, Aug 26, 2018 at 01:37:41PM +0200, René Scharfe wrote:

> Am 14.08.2018 um 03:58 schrieb Jeff King:
> > Your suggestion can be implemented using khash (my patch below).
> > 
> >> Before:
> >> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
> >>
> >>   Time (mean ± σ):     269.5 ms ±  26.7 ms    [User: 247.7 ms, System: 21.4 ms]
> >>
> >>   Range (min … max):   240.3 ms … 339.3 ms
> >>
> >> After:
> >> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
> >>
> >>   Time (mean ± σ):     224.2 ms ±  18.2 ms    [User: 201.7 ms, System: 22.1 ms]
> >>
> >>   Range (min … max):   205.0 ms … 259.0 ms
> > 
> > Yeah. My best-of-five dropped from 300ms to 247ms. That 300 was using
> > the memory pool, though khash's deletion strategy isn't all that
> > different (the wasted memory hangs around until the next hash resize,
> > but if you're evenly dropping and adding, you likely won't need to
> > resize).
> 
> With your khash patch:
> 
> Benchmark #1: ./git-cat-file --batch-all-objects --buffer --unordered --batch-check='%(objectname)'
> 
>   Time (mean ± σ):     159.1 ms ±  20.5 ms    [User: 140.3 ms, System: 18.5 ms]
> 
>   Range (min … max):   140.0 ms … 214.0 ms
> 
> So it seems worth it.

Hmm, that really does. Which is a shame, because I hoped that one day we
could get rid of the nasty macro-infestation that is khash.h. But it
really is a lot faster than the alternatives.

> [...]

I agree with all of your comments here. What I posted was just trying to
do the least amount of work to get something we could time.

Do you want to pick up this topic and carry it forward?

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-27 23:03               ` Jeff King
@ 2018-10-01 19:15                 ` René Scharfe
  2018-10-01 20:26                   ` Jeff King
  0 siblings, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-10-01 19:15 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 28.08.2018 um 01:03 schrieb Jeff King:
> On Sun, Aug 26, 2018 at 01:37:41PM +0200, René Scharfe wrote:
>> So it seems worth it.
> 
> Hmm, that really does. Which is a shame, because I hoped that one day we
> could get rid of the nasty macro-infestation that is khash.h. But it
> really is a lot faster than the alternatives.

Well, we only compare it to hashmap.c here.  There might be better
implementations out there.  Hash tables in plain old C seem to be much
harder to find than e.g. in C++, though.

And then there are several possible variations that affect
performance -- perhaps hashmap.c could be improved by using open
addressing, maybe with Robin Hood hashing, and/or by offering better
support for sets, and/or by having a few macros to allow type
specialization.

But I like how khash.h is both already in the tree and also really easy
to deploy, as it's just a single header file.  It's a tasty low-hanging
fruit.

> Do you want to pick up this topic and carry it forward?

Well, I tried to simplify the implementation as much as possible and
ended up doing the opposite of what I wrote earlier.  Hmm.

The patch below postpones struct allocation until cleanup time, which is
a bit weird.  We can't avoid it fully without reimplementing kh_destroy,
but we can use structs supplied by callers for basically all other
operations.  That avoids NULL checks, and the main benefits of that are
simplicity and safety; performance is not much better without them.

This patch doesn't provide a _count (or _is_empty) method.  It would be
cleaner to have one added as the first step and just change its
implementation when switching to khash, but it's harder than expected
with hashmap.c; doing that later (if at all) is easier.

Using sha1hash instead of open-coding it may seem a bit anachronistic,
but is the best way to document the usage of that pattern (i.e. to
truncate a SHA1 hash to get a shorter one for a hash table).

-- >8 --
Subject: [PATCH] oidset: use khash

Reimplement struct oidset using khash.h in order to reduce its memory
footprint and make it faster.

This is straight-forward, except for oidset_clear(), which needs to
allocate a kh_oid_t on the heap in order to be able to feed it to
kh_destroy_oid() for release it.  Alternatively we could open-code the
relevant parts of the latter, but that would be a layering violation.

Performance of a command that mainly checks for duplicate objects
using an oidset, with master and Clang 6.0.1:

$ cmd="./git-cat-file --batch-all-objects --unordered --buffer --batch-check='%(objectname)'"

$ /usr/bin/time $cmd >/dev/null
0.22user 0.03system 0:00.25elapsed 99%CPU (0avgtext+0avgdata 48484maxresident)k
0inputs+0outputs (0major+11204minor)pagefaults 0swaps

$ hyperfine "$cmd"
Benchmark #1: ./git-cat-file --batch-all-objects --unordered --buffer --batch-check='%(objectname)'

  Time (mean ± σ):     250.0 ms ±   6.0 ms    [User: 225.9 ms, System: 23.6 ms]

  Range (min … max):   242.0 ms … 261.1 ms

And with this patch:

$ /usr/bin/time $cmd >/dev/null
0.14user 0.00system 0:00.15elapsed 100%CPU (0avgtext+0avgdata 41396maxresident)k
0inputs+0outputs (0major+8318minor)pagefaults 0swaps

$ hyperfine "$cmd"
Benchmark #1: ./git-cat-file --batch-all-objects --unordered --buffer --batch-check='%(objectname)'

  Time (mean ± σ):     151.9 ms ±   4.9 ms    [User: 130.5 ms, System: 21.2 ms]

  Range (min … max):   148.2 ms … 170.4 ms

Initial-patch-by: Jeff King <peff@peff.net>
Signed-off-by: Rene Scharfe <l.s.r@web.de>
---
 fetch-pack.c |  2 +-
 oidset.c     | 36 ++++++++++++++----------------------
 oidset.h     | 36 ++++++++++++++++++++++++++++--------
 3 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/fetch-pack.c b/fetch-pack.c
index 75047a4b2a..a839315726 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -536,7 +536,7 @@ static int tip_oids_contain(struct oidset *tip_oids,
 	 * add to "newlist" between calls, the additions will always be for
 	 * oids that are already in the set.
 	 */
-	if (!tip_oids->map.map.tablesize) {
+	if (!tip_oids->set.n_buckets) {
 		add_refs_to_oidset(tip_oids, unmatched);
 		add_refs_to_oidset(tip_oids, newlist);
 	}
diff --git a/oidset.c b/oidset.c
index 454c54f933..d15b2b7a89 100644
--- a/oidset.c
+++ b/oidset.c
@@ -3,38 +3,30 @@
 
 int oidset_contains(const struct oidset *set, const struct object_id *oid)
 {
-	if (!set->map.map.tablesize)
-		return 0;
-	return !!oidmap_get(&set->map, oid);
+	khiter_t pos = kh_get_oid(&set->set, *oid);
+	return pos != kh_end(&set->set);
 }
 
 int oidset_insert(struct oidset *set, const struct object_id *oid)
 {
-	struct oidmap_entry *entry;
-
-	if (!set->map.map.tablesize)
-		oidmap_init(&set->map, 0);
-	else if (oidset_contains(set, oid))
-		return 1;
-
-	entry = xmalloc(sizeof(*entry));
-	oidcpy(&entry->oid, oid);
-
-	oidmap_put(&set->map, entry);
-	return 0;
+	int added;
+	kh_put_oid(&set->set, *oid, &added);
+	return !added;
 }
 
 int oidset_remove(struct oidset *set, const struct object_id *oid)
 {
-	struct oidmap_entry *entry;
-
-	entry = oidmap_remove(&set->map, oid);
-	free(entry);
-
-	return (entry != NULL);
+	khiter_t pos = kh_get_oid(&set->set, *oid);
+	if (pos == kh_end(&set->set))
+		return 0;
+	kh_del_oid(&set->set, pos);
+	return 1;
 }
 
 void oidset_clear(struct oidset *set)
 {
-	oidmap_free(&set->map, 1);
+	kh_oid_t *to_free = kh_init_oid();
+	*to_free = set->set;
+	kh_destroy_oid(to_free);
+	oidset_init(set, 0);
 }
diff --git a/oidset.h b/oidset.h
index 40ec5f87fe..4b90540cd4 100644
--- a/oidset.h
+++ b/oidset.h
@@ -1,7 +1,8 @@
 #ifndef OIDSET_H
 #define OIDSET_H
 
-#include "oidmap.h"
+#include "hashmap.h"
+#include "khash.h"
 
 /**
  * This API is similar to sha1-array, in that it maintains a set of object ids
@@ -15,19 +16,33 @@
  *      table overhead.
  */
 
+static inline unsigned int oid_hash(struct object_id oid)
+{
+	return sha1hash(oid.hash);
+}
+
+static inline int oid_equal(struct object_id a, struct object_id b)
+{
+	return oideq(&a, &b);
+}
+
+KHASH_INIT(oid, struct object_id, int, 0, oid_hash, oid_equal)
+
 /**
  * A single oidset; should be zero-initialized (or use OIDSET_INIT).
  */
 struct oidset {
-	struct oidmap map;
+	kh_oid_t set;
 };
 
-#define OIDSET_INIT { OIDMAP_INIT }
+#define OIDSET_INIT { { 0 } }
 
 
 static inline void oidset_init(struct oidset *set, size_t initial_size)
 {
-	oidmap_init(&set->map, initial_size);
+	memset(&set->set, 0, sizeof(set->set));
+	if (initial_size)
+		kh_resize_oid(&set->set, initial_size);
 }
 
 /**
@@ -58,19 +73,24 @@ int oidset_remove(struct oidset *set, const struct object_id *oid);
 void oidset_clear(struct oidset *set);
 
 struct oidset_iter {
-	struct oidmap_iter m_iter;
+	kh_oid_t *set;
+	khiter_t iter;
 };
 
 static inline void oidset_iter_init(struct oidset *set,
 				    struct oidset_iter *iter)
 {
-	oidmap_iter_init(&set->map, &iter->m_iter);
+	iter->set = &set->set;
+	iter->iter = kh_begin(iter->set);
 }
 
 static inline struct object_id *oidset_iter_next(struct oidset_iter *iter)
 {
-	struct oidmap_entry *e = oidmap_iter_next(&iter->m_iter);
-	return e ? &e->oid : NULL;
+	for (; iter->iter != kh_end(iter->set); iter->iter++) {
+		if (kh_exist(iter->set, iter->iter))
+			return &kh_key(iter->set, iter->iter++);
+	}
+	return NULL;
 }
 
 static inline struct object_id *oidset_iter_first(struct oidset *set,
-- 
2.19.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-10-01 19:15                 ` René Scharfe
@ 2018-10-01 20:26                   ` Jeff King
  2018-10-02 19:05                     ` René Scharfe
  0 siblings, 1 reply; 31+ messages in thread
From: Jeff King @ 2018-10-01 20:26 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Mon, Oct 01, 2018 at 09:15:53PM +0200, René Scharfe wrote:

> Am 28.08.2018 um 01:03 schrieb Jeff King:
> > On Sun, Aug 26, 2018 at 01:37:41PM +0200, René Scharfe wrote:
> >> So it seems worth it.
> > 
> > Hmm, that really does. Which is a shame, because I hoped that one day we
> > could get rid of the nasty macro-infestation that is khash.h. But it
> > really is a lot faster than the alternatives.
> 
> Well, we only compare it to hashmap.c here.  There might be better
> implementations out there.  Hash tables in plain old C seem to be much
> harder to find than e.g. in C++, though.

I think it may be either-or here. The speed benefit to using khash here
is that we stick the data into the hash table itself, rather than
incurring a separate malloc for each entry. Which is pretty hard to do
in C without macros (the alternative is lots of void pointers, and
telling the hash table "your size is X bytes").

> And then there are several possible variations that affect
> performance -- perhaps hashmap.c could be improved by using open
> addressing, maybe with Robin Hood hashing, and/or by offering better
> support for sets, and/or by having a few macros to allow type
> specialization.

The reason hashmap.c was added was to avoid open addressing. ;)

So yeah, I think it could perhaps be improved, but in my mind talking
about "hashmap.c" is fundamentally talking about chained buckets.

But whatever you want to call it, I would be happy with a more type-safe
and performance hashmap.

> But I like how khash.h is both already in the tree and also really easy
> to deploy, as it's just a single header file.  It's a tasty low-hanging
> fruit.

Yeah. And if it really does perform better, I think we should stick with
it in the code base. I wonder if we could stand to clean up the
interfaces a little.  E.g., I had a hard time declaring a hash in one
place, and then defining it somewhere else. And I think as you found
that it insists on heap-allocating the hash-table struct itself, which
does not match our usual style.

> > Do you want to pick up this topic and carry it forward?
> 
> Well, I tried to simplify the implementation as much as possible and
> ended up doing the opposite of what I wrote earlier.  Hmm.
> 
> The patch below postpones struct allocation until cleanup time, which is
> a bit weird.  We can't avoid it fully without reimplementing kh_destroy,
> but we can use structs supplied by callers for basically all other
> operations.  That avoids NULL checks, and the main benefits of that are
> simplicity and safety; performance is not much better without them.

Your patch looks OK from a cursory view. I actually think that retaining
the extra NULL encapsulation is not the worst thing in the world. Most
callers should be going through the oid_* functions anyway.

But if we want to take the time to refactor khash (or even write our own
version, taking inspiration from what's there), the result would be
better still. If you're not planning to work on that in the near future,
though, I'd be OK with either of the alternates (the extra level of
pointer indirection from earlier, or this slight kh_destroy hackery
here).

> -- >8 --
> Subject: [PATCH] oidset: use khash
> 
> Reimplement struct oidset using khash.h in order to reduce its memory
> footprint and make it faster.
> 
> This is straight-forward, except for oidset_clear(), which needs to
> allocate a kh_oid_t on the heap in order to be able to feed it to
> kh_destroy_oid() for release it.  Alternatively we could open-code the
> relevant parts of the latter, but that would be a layering violation.

This is kind of a layering violation, too. You're assuming that struct
assignment is sufficient to make one kh struct freeable from another
pointer. That's probably reasonable, since you're just destroying them
both (e.g., some of our FLEX structs point into their own struct memory,
making a hidden dependency; but they obviously would not need to free
such a field).

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-10-01 20:26                   ` Jeff King
@ 2018-10-02 19:05                     ` René Scharfe
  2018-10-02 19:19                       ` Jeff King
  0 siblings, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-10-02 19:05 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 01.10.2018 um 22:26 schrieb Jeff King:
> On Mon, Oct 01, 2018 at 09:15:53PM +0200, René Scharfe wrote:
> The reason hashmap.c was added was to avoid open addressing. ;)
Because efficient removal of elements is easier to implement with
chaining, according to 6a364ced49 (add a hashtable implementation that
supports O(1) removal).  khash.h deletes using its flags bitmap.  We
didn't compare their performance when entries are removed so far.

> So yeah, I think it could perhaps be improved, but in my mind talking
> about "hashmap.c" is fundamentally talking about chained buckets.

Admittedly I wouldn't touch hashmap.c, as I find its interface too
complex to wrap my head around.  But perhaps I just didn't try hard
enough, yet.

>> But I like how khash.h is both already in the tree and also really easy
>> to deploy, as it's just a single header file.  It's a tasty low-hanging
>> fruit.
> 
> Yeah. And if it really does perform better, I think we should stick with
> it in the code base. I wonder if we could stand to clean up the
> interfaces a little.  E.g., I had a hard time declaring a hash in one
> place, and then defining it somewhere else.

You can't use KHASH_DECLARE and KHASH_INIT together, as both declare
the same structs.  So I guess the idea is to have a header file with
KHASH_DECLARE and a .c file with KHASH_INIT, the latter *not* including
the former, but both including khash.h.  I didn't actually try that,
though.

> And I think as you found
> that it insists on heap-allocating the hash-table struct itself, which
> does not match our usual style.

Perhaps we can fix that with little effort (see below).

>> This is straight-forward, except for oidset_clear(), which needs to
>> allocate a kh_oid_t on the heap in order to be able to feed it to
>> kh_destroy_oid() for release it.  Alternatively we could open-code the
>> relevant parts of the latter, but that would be a layering violation.
> 
> This is kind of a layering violation, too. You're assuming that struct
> assignment is sufficient to make one kh struct freeable from another
> pointer. That's probably reasonable, since you're just destroying them
> both (e.g., some of our FLEX structs point into their own struct memory,
> making a hidden dependency; but they obviously would not need to free
> such a field).

Fair enough.  How about this on top?  (The khash.h part would go in
first in a separate patch in a proper series.)

NB: I stuck to the 4-spaces-tabs formatting in khash.h here.

---
 khash.h  | 9 +++++++--
 oidset.c | 4 +---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/khash.h b/khash.h
index 07b4cc2e67..d10caa0c35 100644
--- a/khash.h
+++ b/khash.h
@@ -82,11 +82,16 @@ static const double __ac_HASH_UPPER = 0.77;
 	SCOPE kh_##name##_t *kh_init_##name(void) {							\
 		return (kh_##name##_t*)xcalloc(1, sizeof(kh_##name##_t));		\
 	}																	\
+	SCOPE void kh_release_##name(kh_##name##_t *h)						\
+	{																	\
+		free(h->flags);													\
+		free((void *)h->keys);											\
+		free((void *)h->vals);											\
+	}																	\
 	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
 	{																	\
 		if (h) {														\
-			free((void *)h->keys); free(h->flags);					\
-			free((void *)h->vals);										\
+			kh_release_##name(h);										\
 			free(h);													\
 		}																\
 	}																	\
diff --git a/oidset.c b/oidset.c
index d15b2b7a89..9836d427ef 100644
--- a/oidset.c
+++ b/oidset.c
@@ -25,8 +25,6 @@ int oidset_remove(struct oidset *set, const struct object_id *oid)
 
 void oidset_clear(struct oidset *set)
 {
-	kh_oid_t *to_free = kh_init_oid();
-	*to_free = set->set;
-	kh_destroy_oid(to_free);
+	kh_release_oid(&set->set);
 	oidset_init(set, 0);
 }
-- 
2.19.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-10-02 19:05                     ` René Scharfe
@ 2018-10-02 19:19                       ` Jeff King
  0 siblings, 0 replies; 31+ messages in thread
From: Jeff King @ 2018-10-02 19:19 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Tue, Oct 02, 2018 at 09:05:32PM +0200, René Scharfe wrote:

> > The reason hashmap.c was added was to avoid open addressing. ;)
> Because efficient removal of elements is easier to implement with
> chaining, according to 6a364ced49 (add a hashtable implementation that
> supports O(1) removal).  khash.h deletes using its flags bitmap.  We
> didn't compare their performance when entries are removed so far.

I think it may depend on your workload. Open-addressing generally uses a
tombstone, so you're still dealing with the "deleted" entries until the
next table resize. I suspect that's fine in most cases, but I also am
sure you could find a benchmark that favors the chained approach (I
think in most cases we actually never delete at all -- we simply fill up
a table and then eventually clear it).

> > So yeah, I think it could perhaps be improved, but in my mind talking
> > about "hashmap.c" is fundamentally talking about chained buckets.
> 
> Admittedly I wouldn't touch hashmap.c, as I find its interface too
> complex to wrap my head around.  But perhaps I just didn't try hard
> enough, yet.

FWIW, it's not just you. ;)

> > Yeah. And if it really does perform better, I think we should stick with
> > it in the code base. I wonder if we could stand to clean up the
> > interfaces a little.  E.g., I had a hard time declaring a hash in one
> > place, and then defining it somewhere else.
> 
> You can't use KHASH_DECLARE and KHASH_INIT together, as both declare
> the same structs.  So I guess the idea is to have a header file with
> KHASH_DECLARE and a .c file with KHASH_INIT, the latter *not* including
> the former, but both including khash.h.  I didn't actually try that,
> though.

Yeah, that seems weird. You'd want to include one from the other to make
sure that they both match.

By the way, if you do want to pursue changes, I have no problem at all
hacking up khash into something that can't be merged with its upstream.
It's nice that it's a well-used and tested library, but I'd much rather
have something that we on this project understand (and that matches our
conventions and style).

> > This is kind of a layering violation, too. You're assuming that struct
> > assignment is sufficient to make one kh struct freeable from another
> > pointer. That's probably reasonable, since you're just destroying them
> > both (e.g., some of our FLEX structs point into their own struct memory,
> > making a hidden dependency; but they obviously would not need to free
> > such a field).
> 
> Fair enough.  How about this on top?  (The khash.h part would go in
> first in a separate patch in a proper series.)

Yes, much nicer, and the khash change wasn't too painful.

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 17:23     ` Jeff King
  2018-08-11 20:59       ` René Scharfe
@ 2018-08-13 17:15       ` René Scharfe
  2018-08-14  2:01         ` Jeff King
  1 sibling, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-13 17:15 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 11.08.2018 um 19:23 schrieb Jeff King:
> On Sat, Aug 11, 2018 at 01:02:48PM -0400, Jeff King wrote:
> 
>>    - we could probably improve the speed of oidset. Two things I notice
>>      about its implementation:
>>
>>        - it has to malloc for each entry, which I suspect is the main
>> 	bottleneck. We could probably pool-allocate blocks, and when
>> 	entries get removed just leave the allocations in place until we
>> 	clear(). Most callers tend to build up a set and then query it a
>> 	lot, or possibly remove items from the set until it's empty. But
>> 	my guess is that few or none want a long-lived set that they add
>> 	and remove from randomly.
>>
>>        - insertion lets you do check-and-insert as a single operation
>> 	(something I failed to notice in [1]). But it's not implemented
>> 	as efficiently as it could be, since the "contains" and "put"
>> 	operations do separate lookups. This doesn't matter for a set
>> 	that's queried a lot more, but for something like de-duping
>> 	(like I was doing in [1]) most operations are check-and-insert.
>> [...]
>> [1] https://public-inbox.org/git/20180810232457.GG19875@sigill.intra.peff.net/
>>      but note that it's buried pretty deep.
> 
> Some notes on this, based on the cat-file patch that I linked to.
> 
> Before any optimizations, my best-of-five timing for:
> 
>    git cat-file --batch-all-objects --unordered --buffer \
>                 --batch-check='%(objectname)' >/dev/null
> 
> in git.git was:
> 
>    real	0m0.434s
>    user	0m0.414s
>    sys	0m0.020s
> 
> That's enumerating every object in the repo but not doing much more than
> de-duping the names and printing them.
> 
> Applying this patch:
> 
> diff --git a/builtin/cat-file.c b/builtin/cat-file.c
> index 45992c9be9..04b5cda191 100644
> --- a/builtin/cat-file.c
> +++ b/builtin/cat-file.c
> @@ -443,9 +443,8 @@ static int batch_unordered_object(const struct object_id *oid, void *vdata)
>   {
>   	struct object_cb_data *data = vdata;
>   
> -	if (oidset_contains(data->seen, oid))
> +	if (oidset_insert(data->seen, oid))
>   		return 0;
> -	oidset_insert(data->seen, oid);
>   
>   	return batch_object_cb(oid, data);
>   }
> 
> to use the single-call set-and-replace doesn't seem to help (any
> improvement is within the run-to-run noise). So a single hash lookup per
> object does not seem to be measurable. And thus teaching oidset_insert()
> to do a single hash lookup for check-and-insert is unlikely to help us.
> 
> On top of that, I tried using a pool to store the set entries:
> 
> diff --git a/oidset.c b/oidset.c
> index 454c54f933..504929f177 100644
> --- a/oidset.c
> +++ b/oidset.c
> @@ -17,7 +17,10 @@ int oidset_insert(struct oidset *set, const struct object_id *oid)
>   	else if (oidset_contains(set, oid))
>   		return 1;
>   
> -	entry = xmalloc(sizeof(*entry));
> +	if (!set->pool)
> +		mem_pool_init(&set->pool, 0);
> +
> +	entry = mem_pool_alloc(set->pool, sizeof(*entry));
>   	oidcpy(&entry->oid, oid);
>   
>   	oidmap_put(&set->map, entry);
> @@ -29,12 +32,13 @@ int oidset_remove(struct oidset *set, const struct object_id *oid)
>   	struct oidmap_entry *entry;
>   
>   	entry = oidmap_remove(&set->map, oid);
> -	free(entry);
> +	/* abandon pool memory for "entry" */
>   
>   	return (entry != NULL);
>   }
>   
>   void oidset_clear(struct oidset *set)
>   {
> -	oidmap_free(&set->map, 1);
> +	oidmap_free(&set->map, 0);
> +	mem_pool_discard(set->pool, 0);
>   }
> diff --git a/oidset.h b/oidset.h
> index 40ec5f87fe..6b8b802987 100644
> --- a/oidset.h
> +++ b/oidset.h
> @@ -20,6 +20,7 @@
>    */
>   struct oidset {
>   	struct oidmap map;
> +	struct mem_pool *pool;
>   };
>   
>   #define OIDSET_INIT { OIDMAP_INIT }
> 
> That drops my best-of-five to:
> 
>    real	0m0.300s
>    user	0m0.288s
>    sys	0m0.012s
> 
> which is over a 25% speedup. So that does seem worth pursuing.
> 
> For reference, the oid_array code path for cat-file is still:
> 
>    real	0m0.161s
>    user	0m0.157s
>    sys	0m0.004s
> 
> but that's not completely apples to apples. The oidset code is also
> iterating the packfiles in a different order and generating a revidx
> (which I know is about 25ms in this repo). So a better test would
> actually swap one data structure out for the other with no other changes
> (I just happened to have this test handy, and really wanted to know
> whether the mem_pool stuff would help).

Getting sidetracked here, but the following patch helps both sides a bit:

-- >8 --
Subject: [PATCH] cat-file: reuse strbuf in batch_object_write()

Avoid allocating and then releasing memory for constructing the output
for each object by reusing the strbuf for all of them.

Signed-off-by: Rene Scharfe <l.s.r@web.de>
---
# on git.git
$ hyperfine "./git-cat-file --batch-all-objects --buffer --batch-check='%(objectname)'"

Before:
Benchmark #1: ./git-cat-file --batch-all-objects --buffer --batch-check='%(objectname)'

   Time (mean ± σ):     139.3 ms ±  20.1 ms    [User: 124.2 ms, System: 14.8 ms]

   Range (min … max):   124.4 ms … 205.9 ms

After:
Benchmark #1: ./git-cat-file --batch-all-objects --buffer --batch-check='%(objectname)'

   Time (mean ± σ):     115.1 ms ±  20.6 ms    [User: 102.0 ms, System: 12.8 ms]

   Range (min … max):    99.6 ms … 198.1 ms

Test done one a small VM -- the measurements are quite noisy.

  builtin/cat-file.c | 17 +++++++++++------
  1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 4a44b2404f..a979fc1f3a 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -211,6 +211,12 @@ struct expand_data {
  	 * optimized out.
  	 */
  	unsigned skip_object_info : 1;
+
+	/*
+	 * Scratch space for expanded string; shared between invocations
+	 * to reduce the number of memory allocations.
+	 */
+	struct strbuf *scratch;
  };
  
  static int is_atom(const char *atom, const char *s, int slen)
@@ -340,8 +346,6 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
  static void batch_object_write(const char *obj_name, struct batch_options *opt,
  			       struct expand_data *data)
  {
-	struct strbuf buf = STRBUF_INIT;
-
  	if (!data->skip_object_info &&
  	    oid_object_info_extended(the_repository, &data->oid, &data->info,
  				     OBJECT_INFO_LOOKUP_REPLACE) < 0) {
@@ -351,10 +355,10 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt,
  		return;
  	}
  
-	strbuf_expand(&buf, opt->format, expand_format, data);
-	strbuf_addch(&buf, '\n');
-	batch_write(opt, buf.buf, buf.len);
-	strbuf_release(&buf);
+	strbuf_reset(data->scratch);
+	strbuf_expand(data->scratch, opt->format, expand_format, data);
+	strbuf_addch(data->scratch, '\n');
+	batch_write(opt, data->scratch->buf, data->scratch->len);
  
  	if (opt->print_contents) {
  		print_object_or_die(opt, data);
@@ -453,6 +457,7 @@ static int batch_objects(struct batch_options *opt)
  	 * object.
  	 */
  	memset(&data, 0, sizeof(data));
+	data.scratch = &buf;
  	data.mark_query = 1;
  	strbuf_expand(&buf, opt->format, expand_format, &data);
  	data.mark_query = 0;
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-13 17:15       ` René Scharfe
@ 2018-08-14  2:01         ` Jeff King
  0 siblings, 0 replies; 31+ messages in thread
From: Jeff King @ 2018-08-14  2:01 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Mon, Aug 13, 2018 at 07:15:19PM +0200, René Scharfe wrote:

> Getting sidetracked here, but the following patch helps both sides a bit:
> 
> -- >8 --
> Subject: [PATCH] cat-file: reuse strbuf in batch_object_write()
> 
> Avoid allocating and then releasing memory for constructing the output
> for each object by reusing the strbuf for all of them.

Thanks, this an easy and sensible optimization. I have a few patches to
send on top of my cat-file topic, so I'll pick this up there.

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
  2018-08-11 16:54   ` Ævar Arnfjörð Bjarmason
  2018-08-11 17:02   ` Jeff King
@ 2018-08-11 20:48   ` Ramsay Jones
  2018-08-25 18:49     ` René Scharfe
  2018-08-13 18:43   ` Junio C Hamano
  3 siblings, 1 reply; 31+ messages in thread
From: Ramsay Jones @ 2018-08-11 20:48 UTC (permalink / raw)
  To: René Scharfe, Git List
  Cc: Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Junio C Hamano



On 11/08/18 16:47, René Scharfe wrote:
> Object IDs to skip are stored in a shared static oid_array.  Lookups do
> a binary search on the sorted array.  The code checks if the object IDs
> are already in the correct order while loading and skips sorting in that
> case.
> 
> Simplify the code by using an oidset instead.  Memory usage is a bit
> higher, but lookups are done in constant time and there is no need to
> worry about any sort order.
> 
> Embed the oidset into struct fsck_options to make its ownership clear
> (no hidden sharing) and avoid unnecessary pointer indirection.
> 
> Signed-off-by: Rene Scharfe <l.s.r@web.de>
> ---
>  fsck.c | 23 ++---------------------
>  fsck.h |  8 +++++---
>  2 files changed, 7 insertions(+), 24 deletions(-)
> 
> diff --git a/fsck.c b/fsck.c
> index 83f4562390..9246afee22 100644
> --- a/fsck.c
> +++ b/fsck.c
> @@ -10,7 +10,6 @@
>  #include "fsck.h"
>  #include "refs.h"
>  #include "utf8.h"
> -#include "sha1-array.h"
>  #include "decorate.h"
>  #include "oidset.h"
>  #include "packfile.h"
> @@ -182,19 +181,10 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
>  
>  static void init_skiplist(struct fsck_options *options, const char *path)
>  {
> -    static struct oid_array skiplist = OID_ARRAY_INIT;
> -    int sorted;
>      FILE *fp;
>      struct strbuf sb = STRBUF_INIT;
>      struct object_id oid;
>  
> -    if (options->skiplist)
> -        sorted = options->skiplist->sorted;
> -    else {
> -        sorted = 1;
> -        options->skiplist = &skiplist;
> -    }
> -
>      fp = fopen(path, "r");
>      if (!fp)
>          die("Could not open skip list: %s", path);
> @@ -202,17 +192,10 @@ static void init_skiplist(struct fsck_options *options, const char *path)
>          const char *p;
>          if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
>              die("Invalid SHA-1: %s", sb.buf);
> -        oid_array_append(&skiplist, &oid);
> -        if (sorted && skiplist.nr > 1 &&
> -                oidcmp(&skiplist.oid[skiplist.nr - 2],
> -                       &oid) > 0)
> -            sorted = 0;
> +        oidset_insert(&options->skiplist, &oid);
>      }
>      fclose(fp);
>      strbuf_release(&sb);
> -
> -    if (sorted)
> -        skiplist.sorted = 1;
>  }
>  
>  static int parse_msg_type(const char *str)
> @@ -317,9 +300,7 @@ static void append_msg_id(struct strbuf *sb, const char *msg_id)
>  
>  static int object_on_skiplist(struct fsck_options *opts, struct object *obj)
>  {
> -    if (opts && opts->skiplist && obj)
> -        return oid_array_lookup(opts->skiplist, &obj->oid) >= 0;
> -    return 0;
> +    return opts && obj && oidset_contains(&opts->skiplist, &obj->oid);
>  }
>  
>  __attribute__((format (printf, 4, 5)))
> diff --git a/fsck.h b/fsck.h
> index c3cf5e0034..26120e6186 100644
> --- a/fsck.h
> +++ b/fsck.h
> @@ -1,6 +1,8 @@
>  #ifndef GIT_FSCK_H
>  #define GIT_FSCK_H
>  
> +#include "oidset.h"
> +
>  #define FSCK_ERROR 1
>  #define FSCK_WARN 2
>  #define FSCK_IGNORE 3
> @@ -34,12 +36,12 @@ struct fsck_options {
>      fsck_error error_func;
>      unsigned strict:1;
>      int *msg_type;
> -    struct oid_array *skiplist;
> +    struct oidset skiplist;
>      struct decoration *object_names;
>  };
>  
> -#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL }
> -#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL }
> +#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL, OIDSET_INIT }
> +#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL, OIDSET_INIT }

Note that a NULL initialiser, for the object_names field, is missing
(not introduced by this patch). Since you have bumped into the 80th
column, you may not want to add that NULL to the end of these macros
(it is not _necessary_ after all). However, ... :-D

Otherwise, LGTM.

Thanks!

ATB,
Ramsay Jones

>  
>  /* descend in all linked child objects
>   * the return value is:

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 20:48   ` Ramsay Jones
@ 2018-08-25 18:49     ` René Scharfe
  0 siblings, 0 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-25 18:49 UTC (permalink / raw)
  To: Ramsay Jones, Git List
  Cc: Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Junio C Hamano

Am 11.08.2018 um 22:48 schrieb Ramsay Jones:
> On 11/08/18 16:47, René Scharfe wrote:
>> @@ -34,12 +36,12 @@ struct fsck_options {
>>      fsck_error error_func;
>>      unsigned strict:1;
>>      int *msg_type;
>> -    struct oid_array *skiplist;
>> +    struct oidset skiplist;
>>      struct decoration *object_names;
>>  };
>>  
>> -#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL }
>> -#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL }
>> +#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL, OIDSET_INIT }
>> +#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL, OIDSET_INIT }
> 
> Note that a NULL initialiser, for the object_names field, is missing
> (not introduced by this patch). Since you have bumped into the 80th
> column, you may not want to add that NULL to the end of these macros
> (it is not _necessary_ after all). However, ... :-D

Exactly my thoughts -- except the "However" part. :)

I even thought about reordering the struct to move the NULL-initialized
elements to the end, allowing us to drop them from the initializer, but
felt that this would be a bit too much..

René

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
                     ` (2 preceding siblings ...)
  2018-08-11 20:48   ` Ramsay Jones
@ 2018-08-13 18:43   ` Junio C Hamano
  2018-08-13 20:26     ` René Scharfe
  3 siblings, 1 reply; 31+ messages in thread
From: Junio C Hamano @ 2018-08-13 18:43 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin

René Scharfe <l.s.r@web.de> writes:

> @@ -182,19 +181,10 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
>  static void init_skiplist(struct fsck_options *options, const char
> *path)
>  {
> -	static struct oid_array skiplist = OID_ARRAY_INIT;
> -	int sorted;
>  	FILE *fp;
>  	struct strbuf sb = STRBUF_INIT;
>  	struct object_id oid;
>  -	if (options->skiplist)
> -		sorted = options->skiplist->sorted;
> -	else {

That SP before '-' on the removed line is the most curious aspect of
this patch; I do not recall the last time I saw a corrupt patch from
you---changed where you read/write your mails recently?

> @@ -317,9 +300,7 @@ static void append_msg_id(struct strbuf *sb, const char *msg_id)
>  static int object_on_skiplist(struct fsck_options *opts, struct
> object *obj)
>  {
> -	if (opts && opts->skiplist && obj)
> -		return oid_array_lookup(opts->skiplist, &obj->oid) >= 0;
> -	return 0;
> +	return opts && obj && oidset_contains(&opts->skiplist, &obj->oid);

OK ;-)


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-13 18:43   ` Junio C Hamano
@ 2018-08-13 20:26     ` René Scharfe
  2018-08-13 21:07       ` Junio C Hamano
  0 siblings, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-13 20:26 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin

Am 13.08.2018 um 20:43 schrieb Junio C Hamano:
> René Scharfe <l.s.r@web.de> writes:
> 
>> @@ -182,19 +181,10 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
>>   static void init_skiplist(struct fsck_options *options, const char
>> *path)
>>   {
>> -	static struct oid_array skiplist = OID_ARRAY_INIT;
>> -	int sorted;
>>   	FILE *fp;
>>   	struct strbuf sb = STRBUF_INIT;
>>   	struct object_id oid;
>>   -	if (options->skiplist)
>> -		sorted = options->skiplist->sorted;
>> -	else {
> 
> That SP before '-' on the removed line is the most curious aspect of
> this patch; I do not recall the last time I saw a corrupt patch from
> you---changed where you read/write your mails recently?

Well, I updated Thunderbird to version 60 a few days ago, but I can't
see that extra space in my Sent folder, nor via the NNTP interface of
the mailing list [1], nor on the web interface [2].  The latter shows
extra spaces on the context lines of the first hunk, though, which I
can't see anywhere else.  All the lines look fine in the citation of
Ramsay's reply [3].  So I don't know where these extra spaces are
coming from. :-/

René


[1] news://news.public-inbox.org:119/54a5367f-f832-402c-f51b-3225c92b41ad@web.de
[2] https://public-inbox.org/git/54a5367f-f832-402c-f51b-3225c92b41ad@web.de/
[3] https://public-inbox.org/git/bc9f21c6-b362-2e3f-1820-7da93a76a7c8@ramsayjones.plus.com/


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-13 20:26     ` René Scharfe
@ 2018-08-13 21:07       ` Junio C Hamano
  2018-08-13 23:09         ` René Scharfe
  0 siblings, 1 reply; 31+ messages in thread
From: Junio C Hamano @ 2018-08-13 21:07 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin

René Scharfe <l.s.r@web.de> writes:

> the mailing list [1], nor on the web interface [2].  The latter shows
> extra spaces on the context lines of the first hunk, though, which I
> can't see anywhere else.  All the lines look fine in the citation of
> Ramsay's reply [3].  So I don't know where these extra spaces are
> coming from. :-/

Hmph, interesting.

https://public-inbox.org/git/54a5367f-f832-402c-f51b-3225c92b41ad@web.de/raw

has "Content-Type: text/plain; charset=utf-8; format=flowed".  That
page's rendition is more faithful to the bare text.

The funky " -" one I showed was what Gnus/Emacs came up with as the
result of its best effort to make the format=flawed into something
closer to "text", I think X-<.  

In any case, I do not think format=flowed can be reverted reliably
(or can it be?  If so we should teach mailinfo to repair them).

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] fsck: use oidset for skiplist
  2018-08-13 21:07       ` Junio C Hamano
@ 2018-08-13 23:09         ` René Scharfe
  0 siblings, 0 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-13 23:09 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin

Am 13.08.2018 um 23:07 schrieb Junio C Hamano:
> René Scharfe <l.s.r@web.de> writes:
> 
>> the mailing list [1], nor on the web interface [2].  The latter shows
>> extra spaces on the context lines of the first hunk, though, which I
>> can't see anywhere else.  All the lines look fine in the citation of
>> Ramsay's reply [3].  So I don't know where these extra spaces are
>> coming from. :-/
> 
> Hmph, interesting.
> 
> https://public-inbox.org/git/54a5367f-f832-402c-f51b-3225c92b41ad@web.de/raw
> 
> has "Content-Type: text/plain; charset=utf-8; format=flowed".  That
> page's rendition is more faithful to the bare text.

That explains it: Thunderbird 60 disables most older Add-ons, among them
Toggle Word Wrap, which used to turn off format=flowed for me.  I did
that now using the config settings mailnews.send_plaintext_flowed and
mailnews.display.disable_format_flowed_support.

> The funky " -" one I showed was what Gnus/Emacs came up with as the
> result of its best effort to make the format=flawed into something
> closer to "text", I think X-<.  

Sorry. :(

> In any case, I do not think format=flowed can be reverted reliably
> (or can it be?  If so we should teach mailinfo to repair them).

RFC3676 gives me a headache, perhaps I should go to bed.  If we can
assume that lines don't have trailing spaces originally then we should
be able to reconstruct their contents, no?  "A generating agent SHOULD:
[...] Trim spaces before user-inserted hard line breaks.", i.e. lines
with trailing spaces are doomed to truncation without hope for repair.

René

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file
  2018-08-11 15:39 [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file René Scharfe
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
@ 2018-08-11 16:48 ` Jeff King
  2018-08-11 21:00   ` René Scharfe
  2018-08-25 18:50 ` [PATCH v2 " René Scharfe
  2018-08-25 18:50 ` [PATCH v2 2/2] fsck: use oidset for skiplist René Scharfe
  3 siblings, 1 reply; 31+ messages in thread
From: Jeff King @ 2018-08-11 16:48 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Sat, Aug 11, 2018 at 05:39:27PM +0200, René Scharfe wrote:

> The char array named "buffer" is unlikely to contain a NUL character, so
> printing its contents using %s in a die() format is unsafe.  Clang's
> ASan reports running over the end of buffer in the recently added
> skiplist tests in t5504-fetch-receive-strict.sh as a result.
> 
> Use an idiomatic strbuf_getline() loop instead, which ensures the buffer
> is always NUL-terminated.  As a side-effect this also adds support for
> skiplist files with CRLF line endings.

Nice. Two other side effects, both of which I think are good:

 - this is likely faster for a large skiplist due to fewer syscalls

 - this will no longer complain about a sha1 with a missing newline at
   the end-of-file (it will quietly handle it as if the newline were
   there)

And one I'm not sure about:

 - a read() error will now be quietly ignored; I guess we'd have to
   check ferror(fp) to cover this. I'm not sure if it matters.

>  fsck.c | 23 ++++++++++-------------
>  1 file changed, 10 insertions(+), 13 deletions(-)

Code itself looks good to me (assuming we don't care about the read()
error thing).

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file
  2018-08-11 16:48 ` [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file Jeff King
@ 2018-08-11 21:00   ` René Scharfe
  0 siblings, 0 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-11 21:00 UTC (permalink / raw)
  To: Jeff King
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

Am 11.08.2018 um 18:48 schrieb Jeff King:
> And one I'm not sure about:
> 
>   - a read() error will now be quietly ignored; I guess we'd have to
>     check ferror(fp) to cover this. I'm not sure if it matters.

I'm not sure, either.  It would catch media errors or file system
corruption, right?  Sounds useful, actually.  We should check the other
callers of strbuf_getline and friends as well, though, as reporting such
errors seems to be omitted in most cases:

	$ git grep strbuf_getline | wc -l
	99
	$ git grep ferror | wc -l
	35

René

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v2 1/2] fsck: use strbuf_getline() to read skiplist file
  2018-08-11 15:39 [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file René Scharfe
  2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
  2018-08-11 16:48 ` [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file Jeff King
@ 2018-08-25 18:50 ` René Scharfe
  2018-08-27 23:00   ` Jeff King
  2018-08-25 18:50 ` [PATCH v2 2/2] fsck: use oidset for skiplist René Scharfe
  3 siblings, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-25 18:50 UTC (permalink / raw)
  To: Git List
  Cc: Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano, Jeff King

buffer is unlikely to contain a NUL character, so printing its contents
using %s in a die() format is unsafe (detected with ASan).

Use an idiomatic strbuf_getline() loop instead, which ensures the buffer
is always NUL-terminated, supports CRLF files as well, accepts files
without a newline after the last line, supports any hash length
automatically, and is shorter.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Rene Scharfe <l.s.r@web.de>
---
Added error check.
Hopefully fixed my MUA config..

 fsck.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/fsck.c b/fsck.c
index a0cee0be59..972a26b9ba 100644
--- a/fsck.c
+++ b/fsck.c
@@ -183,8 +183,9 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
 static void init_skiplist(struct fsck_options *options, const char *path)
 {
 	static struct oid_array skiplist = OID_ARRAY_INIT;
-	int sorted, fd;
-	char buffer[GIT_MAX_HEXSZ + 1];
+	int sorted;
+	FILE *fp;
+	struct strbuf sb = STRBUF_INIT;
 	struct object_id oid;
 
 	if (options->skiplist)
@@ -194,25 +195,23 @@ static void init_skiplist(struct fsck_options *options, const char *path)
 		options->skiplist = &skiplist;
 	}
 
-	fd = open(path, O_RDONLY);
-	if (fd < 0)
+	fp = fopen(path, "r");
+	if (!fp)
 		die("Could not open skip list: %s", path);
-	for (;;) {
+	while (!strbuf_getline(&sb, fp)) {
 		const char *p;
-		int result = read_in_full(fd, buffer, sizeof(buffer));
-		if (result < 0)
-			die_errno("Could not read '%s'", path);
-		if (!result)
-			break;
-		if (parse_oid_hex(buffer, &oid, &p) || *p != '\n')
-			die("Invalid SHA-1: %s", buffer);
+		if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
+			die("Invalid SHA-1: %s", sb.buf);
 		oid_array_append(&skiplist, &oid);
 		if (sorted && skiplist.nr > 1 &&
 				oidcmp(&skiplist.oid[skiplist.nr - 2],
 				       &oid) > 0)
 			sorted = 0;
 	}
-	close(fd);
+	if (ferror(fp))
+		die_errno("Could not read '%s'", path);
+	fclose(fp);
+	strbuf_release(&sb);
 
 	if (sorted)
 		skiplist.sorted = 1;
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH v2 1/2] fsck: use strbuf_getline() to read skiplist file
  2018-08-25 18:50 ` [PATCH v2 " René Scharfe
@ 2018-08-27 23:00   ` Jeff King
  0 siblings, 0 replies; 31+ messages in thread
From: Jeff King @ 2018-08-27 23:00 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano

On Sat, Aug 25, 2018 at 08:50:28PM +0200, René Scharfe wrote:

> buffer is unlikely to contain a NUL character, so printing its contents
> using %s in a die() format is unsafe (detected with ASan).

Having mostly forgotten about our earlier discussion, I got confused by
this, thinking the problem was that there is some issue with missing
NULs in the input.

But it is really just:

  We read() into a buffer and on error format the contents using "%s".
  But read() does not NUL-terminate, so die() might walk past the end of
  the buffer.

We _might_ be saved by a NUL in the input, but that is not the primary
concern. ;)

Not worth a re-roll on its own, but since there is some other
discussion, I thought I'd mention my confusion. :)

> Added error check.
> Hopefully fixed my MUA config..
> 
>  fsck.c | 25 ++++++++++++-------------
>  1 file changed, 12 insertions(+), 13 deletions(-)

Patch itself looks good to me.

-Peff

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v2 2/2] fsck: use oidset for skiplist
  2018-08-11 15:39 [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file René Scharfe
                   ` (2 preceding siblings ...)
  2018-08-25 18:50 ` [PATCH v2 " René Scharfe
@ 2018-08-25 18:50 ` René Scharfe
  2018-08-27  7:37   ` Ævar Arnfjörð Bjarmason
  3 siblings, 1 reply; 31+ messages in thread
From: René Scharfe @ 2018-08-25 18:50 UTC (permalink / raw)
  To: Git List
  Cc: Ævar Arnfjörð Bjarmason, Ramsay Jones,
	Johannes Schindelin, Junio C Hamano, Jeff King

Object IDs to skip are stored in a shared static oid_array.  Lookups do
a binary search on the sorted array.  The code checks if the object IDs
are already in the correct order while loading and skips sorting in that
case.  Lookups are done before reporting a (non-fatal) corruption and
before checking .gitmodules files.

Simplify the code by using an oidset instead.  Memory usage is a bit
higher, but we don't need to worry about any sort order anymore.  Embed
the oidset into struct fsck_options to make its ownership clear (no
hidden sharing) and avoid unnecessary pointer indirection.

Performance on repositories with a low number of reported issues and
.gitmodules files (i.e. the usual case) won't be affected much.  The
oidset should be a bit quicker with higher numbers of bad objects in
the skipList.

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Rene Scharfe <l.s.r@web.de>
---
Added small documentation update.

 Documentation/config.txt |  2 +-
 fsck.c                   | 23 ++---------------------
 fsck.h                   |  8 +++++---
 3 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 2fa65b7516..80ab570579 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1715,7 +1715,7 @@ doing the same for `receive.fsck.<msg-id>` and `fetch.fsck.<msg-id>`
 will only cause git to warn.
 
 fsck.skipList::
-	The path to a sorted list of object names (i.e. one SHA-1 per
+	The path to a list of object names (i.e. one SHA-1 per
 	line) that are known to be broken in a non-fatal way and should
 	be ignored. This feature is useful when an established project
 	should be accepted despite early commits containing errors that
diff --git a/fsck.c b/fsck.c
index 972a26b9ba..4c643f1d40 100644
--- a/fsck.c
+++ b/fsck.c
@@ -10,7 +10,6 @@
 #include "fsck.h"
 #include "refs.h"
 #include "utf8.h"
-#include "sha1-array.h"
 #include "decorate.h"
 #include "oidset.h"
 #include "packfile.h"
@@ -182,19 +181,10 @@ static int fsck_msg_type(enum fsck_msg_id msg_id,
 
 static void init_skiplist(struct fsck_options *options, const char *path)
 {
-	static struct oid_array skiplist = OID_ARRAY_INIT;
-	int sorted;
 	FILE *fp;
 	struct strbuf sb = STRBUF_INIT;
 	struct object_id oid;
 
-	if (options->skiplist)
-		sorted = options->skiplist->sorted;
-	else {
-		sorted = 1;
-		options->skiplist = &skiplist;
-	}
-
 	fp = fopen(path, "r");
 	if (!fp)
 		die("Could not open skip list: %s", path);
@@ -202,19 +192,12 @@ static void init_skiplist(struct fsck_options *options, const char *path)
 		const char *p;
 		if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
 			die("Invalid SHA-1: %s", sb.buf);
-		oid_array_append(&skiplist, &oid);
-		if (sorted && skiplist.nr > 1 &&
-				oidcmp(&skiplist.oid[skiplist.nr - 2],
-				       &oid) > 0)
-			sorted = 0;
+		oidset_insert(&options->skiplist, &oid);
 	}
 	if (ferror(fp))
 		die_errno("Could not read '%s'", path);
 	fclose(fp);
 	strbuf_release(&sb);
-
-	if (sorted)
-		skiplist.sorted = 1;
 }
 
 static int parse_msg_type(const char *str)
@@ -319,9 +302,7 @@ static void append_msg_id(struct strbuf *sb, const char *msg_id)
 
 static int object_on_skiplist(struct fsck_options *opts, struct object *obj)
 {
-	if (opts && opts->skiplist && obj)
-		return oid_array_lookup(opts->skiplist, &obj->oid) >= 0;
-	return 0;
+	return opts && obj && oidset_contains(&opts->skiplist, &obj->oid);
 }
 
 __attribute__((format (printf, 4, 5)))
diff --git a/fsck.h b/fsck.h
index 0c7e8c9428..b95595ae5f 100644
--- a/fsck.h
+++ b/fsck.h
@@ -1,6 +1,8 @@
 #ifndef GIT_FSCK_H
 #define GIT_FSCK_H
 
+#include "oidset.h"
+
 #define FSCK_ERROR 1
 #define FSCK_WARN 2
 #define FSCK_IGNORE 3
@@ -35,12 +37,12 @@ struct fsck_options {
 	fsck_error error_func;
 	unsigned strict:1;
 	int *msg_type;
-	struct oid_array *skiplist;
+	struct oidset skiplist;
 	struct decoration *object_names;
 };
 
-#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL }
-#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL }
+#define FSCK_OPTIONS_DEFAULT { NULL, fsck_error_function, 0, NULL, OIDSET_INIT }
+#define FSCK_OPTIONS_STRICT { NULL, fsck_error_function, 1, NULL, OIDSET_INIT }
 
 /* descend in all linked child objects
  * the return value is:
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH v2 2/2] fsck: use oidset for skiplist
  2018-08-25 18:50 ` [PATCH v2 2/2] fsck: use oidset for skiplist René Scharfe
@ 2018-08-27  7:37   ` Ævar Arnfjörð Bjarmason
  2018-08-27 15:23     ` René Scharfe
  0 siblings, 1 reply; 31+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-08-27  7:37 UTC (permalink / raw)
  To: René Scharfe
  Cc: Git List, Ramsay Jones, Johannes Schindelin, Junio C Hamano, Jeff King


On Sat, Aug 25 2018, René Scharfe wrote:

> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 2fa65b7516..80ab570579 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -1715,7 +1715,7 @@ doing the same for `receive.fsck.<msg-id>` and `fetch.fsck.<msg-id>`
>  will only cause git to warn.
>
>  fsck.skipList::
> -	The path to a sorted list of object names (i.e. one SHA-1 per
> +	The path to a list of object names (i.e. one SHA-1 per
>  	line) that are known to be broken in a non-fatal way and should
>  	be ignored. This feature is useful when an established project
>  	should be accepted despite early commits containing errors that

I was going to say that since this is a file format we're likely to
support across versions we should make a note that "up to version
so-and-so this needed to be sorted, but that's longer the case. So
e.g. someone wouldn't test this on 2.20 (or read the online docs) and
then deploy this for older clients..

But...

> -	if (options->skiplist)
> -		sorted = options->skiplist->sorted;
> -	else {
> -		sorted = 1;
> -		options->skiplist = &skiplist;
> -	}
> -
>  	fp = fopen(path, "r");
>  	if (!fp)
>  		die("Could not open skip list: %s", path);
> @@ -202,19 +192,12 @@ static void init_skiplist(struct fsck_options *options, const char *path)
>  		const char *p;
>  		if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
>  			die("Invalid SHA-1: %s", sb.buf);
> -		oid_array_append(&skiplist, &oid);
> -		if (sorted && skiplist.nr > 1 &&
> -				oidcmp(&skiplist.oid[skiplist.nr - 2],
> -				       &oid) > 0)
> -			sorted = 0;
> +		oidset_insert(&options->skiplist, &oid);

...reading this implementation, where we always called
oid_array_append(), but then just kept track of whether the set was
sorted...

>  	}
>  	if (ferror(fp))
>  		die_errno("Could not read '%s'", path);
>  	fclose(fp);
>  	strbuf_release(&sb);
> -
> -	if (sorted)
> -		skiplist.sorted = 1;

...and here where we assigned to the .sorted member of the oid_array...

>  static int object_on_skiplist(struct fsck_options *opts, struct object *obj)
>  {
> -	if (opts && opts->skiplist && obj)
> -		return oid_array_lookup(opts->skiplist, &obj->oid) >= 0;
> -	return 0;
> +	return opts && obj && oidset_contains(&opts->skiplist, &obj->oid);
>  }

....and here where we'd always do the lookup if the skiplist was
initialized, *not* just if it's sorted, and how the sha1-array.c code
has looked ever since cd94c6f91 ("fsck: git receive-pack: support
excluding objects from fsck'ing", 2015-06-22) when this was first added:

    $ git show cd94c6f91:sha1-array.c|grep -A5 sha1_array_lookup
    int sha1_array_lookup(struct sha1_array *array, const unsigned char *sha1)
    {
            if (!array->sorted)
                    sha1_array_sort(array);
            return sha1_pos(sha1, array->sha1, array->nr, sha1_access);
    }

So I think it makes sense to make this series a three-part, where in the
first part we only change these docs to say s/sorted //, and modify the
tests I added in 65a836fa6b ("fsck: add stress tests for fsck.skipList",
2018-07-27) to assert that an unsorted & sorted list of SHA-1s works
just as well.

Then following up with your [12]/2, where the internal implementation is
changed, but we make it clear that it's *just* the internal
implementation. I.e. from a UI perspective the list never had to be
pre-sorted, we'd just spend some work sorting it on the first lookup if
it wasn't sorted already.

Now without some very careful reading it's not clear what "we don't need
to worry about any sort order anymore" in the commit message means,
i.e. what it really means is "for the purposes of the internal
implementation, and as an opt-in user-side optimization ...".

I.e. an alternate version of this whole patch series could also be:

    diff --git a/Documentation/config.txt b/Documentation/config.txt
    index 1c4236498..930807e43 100644
    --- a/Documentation/config.txt
    +++ b/Documentation/config.txt
    @@ -1709,5 +1709,5 @@ will only cause git to warn.

     fsck.skipList::
    -       The path to a sorted list of object names (i.e. one SHA-1 per
    +       The path to a list of object names (i.e. one SHA-1 per
            line) that are known to be broken in a non-fatal way and should
            be ignored. This feature is useful when an established project
    diff --git a/fsck.c b/fsck.c
    index a0cee0be5..9d4e938ad 100644
    --- a/fsck.c
    +++ b/fsck.c
    @@ -184,14 +184,10 @@ static void init_skiplist(struct fsck_options *options, const char *path)
     {
            static struct oid_array skiplist = OID_ARRAY_INIT;
    -       int sorted, fd;
    +       int fd;
            char buffer[GIT_MAX_HEXSZ + 1];
            struct object_id oid;

    -       if (options->skiplist)
    -               sorted = options->skiplist->sorted;
    -       else {
    -               sorted = 1;
    +       if (!options->skiplist)
                    options->skiplist = &skiplist;
    -       }

            fd = open(path, O_RDONLY);
    @@ -208,13 +204,6 @@ static void init_skiplist(struct fsck_options *options, const char *path)
                            die("Invalid SHA-1: %s", buffer);
                    oid_array_append(&skiplist, &oid);
    -               if (sorted && skiplist.nr > 1 &&
    -                               oidcmp(&skiplist.oid[skiplist.nr - 2],
    -                                      &oid) > 0)
    -                       sorted = 0;
            }
            close(fd);
    -
    -       if (sorted)
    -               skiplist.sorted = 1;
     }

Now, I like yours much better. I'm just saying that currently the
patch/commit message combo is confusing about *what* it's
doing. I.e. let's not mix up the correction of docs that were always
wrong with a non-change in the user facing implementation, and some
internal optimization all in one patch.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v2 2/2] fsck: use oidset for skiplist
  2018-08-27  7:37   ` Ævar Arnfjörð Bjarmason
@ 2018-08-27 15:23     ` René Scharfe
  0 siblings, 0 replies; 31+ messages in thread
From: René Scharfe @ 2018-08-27 15:23 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Git List, Ramsay Jones, Johannes Schindelin, Junio C Hamano, Jeff King

Am 27.08.2018 um 09:37 schrieb Ævar Arnfjörð Bjarmason:
> 
> On Sat, Aug 25 2018, René Scharfe wrote:
> 
>> diff --git a/Documentation/config.txt b/Documentation/config.txt
>> index 2fa65b7516..80ab570579 100644
>> --- a/Documentation/config.txt
>> +++ b/Documentation/config.txt
>> @@ -1715,7 +1715,7 @@ doing the same for `receive.fsck.<msg-id>` and `fetch.fsck.<msg-id>`
>>  will only cause git to warn.
>>
>>  fsck.skipList::
>> -	The path to a sorted list of object names (i.e. one SHA-1 per
>> +	The path to a list of object names (i.e. one SHA-1 per
>>  	line) that are known to be broken in a non-fatal way and should
>>  	be ignored. This feature is useful when an established project
>>  	should be accepted despite early commits containing errors that
> 
> I was going to say that since this is a file format we're likely to
> support across versions we should make a note that "up to version
> so-and-so this needed to be sorted, but that's longer the case. So
> e.g. someone wouldn't test this on 2.20 (or read the online docs) and
> then deploy this for older clients..
> 
> But...
> 
>> -	if (options->skiplist)
>> -		sorted = options->skiplist->sorted;
>> -	else {
>> -		sorted = 1;
>> -		options->skiplist = &skiplist;
>> -	}
>> -
>>  	fp = fopen(path, "r");
>>  	if (!fp)
>>  		die("Could not open skip list: %s", path);
>> @@ -202,19 +192,12 @@ static void init_skiplist(struct fsck_options *options, const char *path)
>>  		const char *p;
>>  		if (parse_oid_hex(sb.buf, &oid, &p) || *p != '\0')
>>  			die("Invalid SHA-1: %s", sb.buf);
>> -		oid_array_append(&skiplist, &oid);
>> -		if (sorted && skiplist.nr > 1 &&
>> -				oidcmp(&skiplist.oid[skiplist.nr - 2],
>> -				       &oid) > 0)
>> -			sorted = 0;
>> +		oidset_insert(&options->skiplist, &oid);
> 
> ...reading this implementation, where we always called
> oid_array_append(), but then just kept track of whether the set was
> sorted...
> 
>>  	}
>>  	if (ferror(fp))
>>  		die_errno("Could not read '%s'", path);
>>  	fclose(fp);
>>  	strbuf_release(&sb);
>> -
>> -	if (sorted)
>> -		skiplist.sorted = 1;
> 
> ...and here where we assigned to the .sorted member of the oid_array...
> 
>>  static int object_on_skiplist(struct fsck_options *opts, struct object *obj)
>>  {
>> -	if (opts && opts->skiplist && obj)
>> -		return oid_array_lookup(opts->skiplist, &obj->oid) >= 0;
>> -	return 0;
>> +	return opts && obj && oidset_contains(&opts->skiplist, &obj->oid);
>>  }
> 
> ....and here where we'd always do the lookup if the skiplist was
> initialized, *not* just if it's sorted, and how the sha1-array.c code
> has looked ever since cd94c6f91 ("fsck: git receive-pack: support
> excluding objects from fsck'ing", 2015-06-22) when this was first added:
> 
>     $ git show cd94c6f91:sha1-array.c|grep -A5 sha1_array_lookup
>     int sha1_array_lookup(struct sha1_array *array, const unsigned char *sha1)
>     {
>             if (!array->sorted)
>                     sha1_array_sort(array);
>             return sha1_pos(sha1, array->sha1, array->nr, sha1_access);
>     }
> 
> So I think it makes sense to make this series a three-part, where in the
> first part we only change these docs to say s/sorted //, and modify the
> tests I added in 65a836fa6b ("fsck: add stress tests for fsck.skipList",
> 2018-07-27) to assert that an unsorted & sorted list of SHA-1s works
> just as well.
> 
> Then following up with your [12]/2, where the internal implementation is
> changed, but we make it clear that it's *just* the internal
> implementation. I.e. from a UI perspective the list never had to be
> pre-sorted, we'd just spend some work sorting it on the first lookup if
> it wasn't sorted already.
> 
> Now without some very careful reading it's not clear what "we don't need
> to worry about any sort order anymore" in the commit message means,
> i.e. what it really means is "for the purposes of the internal
> implementation, and as an opt-in user-side optimization ...".
> 
> I.e. an alternate version of this whole patch series could also be:
> 
>     diff --git a/Documentation/config.txt b/Documentation/config.txt
>     index 1c4236498..930807e43 100644
>     --- a/Documentation/config.txt
>     +++ b/Documentation/config.txt
>     @@ -1709,5 +1709,5 @@ will only cause git to warn.
> 
>      fsck.skipList::
>     -       The path to a sorted list of object names (i.e. one SHA-1 per
>     +       The path to a list of object names (i.e. one SHA-1 per
>             line) that are known to be broken in a non-fatal way and should
>             be ignored. This feature is useful when an established project
>     diff --git a/fsck.c b/fsck.c
>     index a0cee0be5..9d4e938ad 100644
>     --- a/fsck.c
>     +++ b/fsck.c
>     @@ -184,14 +184,10 @@ static void init_skiplist(struct fsck_options *options, const char *path)
>      {
>             static struct oid_array skiplist = OID_ARRAY_INIT;
>     -       int sorted, fd;
>     +       int fd;
>             char buffer[GIT_MAX_HEXSZ + 1];
>             struct object_id oid;
> 
>     -       if (options->skiplist)
>     -               sorted = options->skiplist->sorted;
>     -       else {
>     -               sorted = 1;
>     +       if (!options->skiplist)
>                     options->skiplist = &skiplist;
>     -       }
> 
>             fd = open(path, O_RDONLY);
>     @@ -208,13 +204,6 @@ static void init_skiplist(struct fsck_options *options, const char *path)
>                             die("Invalid SHA-1: %s", buffer);
>                     oid_array_append(&skiplist, &oid);
>     -               if (sorted && skiplist.nr > 1 &&
>     -                               oidcmp(&skiplist.oid[skiplist.nr - 2],
>     -                                      &oid) > 0)
>     -                       sorted = 0;
>             }
>             close(fd);
>     -
>     -       if (sorted)
>     -               skiplist.sorted = 1;
>      }
> 
> Now, I like yours much better. I'm just saying that currently the
> patch/commit message combo is confusing about *what* it's
> doing. I.e. let's not mix up the correction of docs that were always
> wrong with a non-change in the user facing implementation, and some
> internal optimization all in one patch.

Now you have me confused.  Unsorted lists were always accepted, but the
documentation asked for a sorted one anyway, probably to avoid sorting
it with every use.  Switching the underlying data structure makes that a
moot point -- sortedness is no longer a concern at all -- not in the
code, and not for users.

Inviting users to run the array implementation with unsorted lists is
not incorrect, but it also doesn't help anyone.  We could clarify that
sorted lists are preferred or recommended instead of required.  I don't
see value in perfecting the documentation of a quirk just before
removing it, though.

René

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2018-10-02 19:19 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-11 15:39 [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file René Scharfe
2018-08-11 15:47 ` [PATCH 2/2] fsck: use oidset for skiplist René Scharfe
2018-08-11 16:54   ` Ævar Arnfjörð Bjarmason
2018-08-25 18:49     ` René Scharfe
2018-08-11 17:02   ` Jeff King
2018-08-11 17:23     ` Jeff King
2018-08-11 20:59       ` René Scharfe
2018-08-13 17:15         ` René Scharfe
2018-08-14  1:58           ` Jeff King
2018-08-14  2:03             ` Jeff King
2018-08-26 11:37             ` René Scharfe
2018-08-27 23:03               ` Jeff King
2018-10-01 19:15                 ` René Scharfe
2018-10-01 20:26                   ` Jeff King
2018-10-02 19:05                     ` René Scharfe
2018-10-02 19:19                       ` Jeff King
2018-08-13 17:15       ` René Scharfe
2018-08-14  2:01         ` Jeff King
2018-08-11 20:48   ` Ramsay Jones
2018-08-25 18:49     ` René Scharfe
2018-08-13 18:43   ` Junio C Hamano
2018-08-13 20:26     ` René Scharfe
2018-08-13 21:07       ` Junio C Hamano
2018-08-13 23:09         ` René Scharfe
2018-08-11 16:48 ` [PATCH 1/2] fsck: use strbuf_getline() to read skiplist file Jeff King
2018-08-11 21:00   ` René Scharfe
2018-08-25 18:50 ` [PATCH v2 " René Scharfe
2018-08-27 23:00   ` Jeff King
2018-08-25 18:50 ` [PATCH v2 2/2] fsck: use oidset for skiplist René Scharfe
2018-08-27  7:37   ` Ævar Arnfjörð Bjarmason
2018-08-27 15:23     ` René Scharfe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).