All of lore.kernel.org
 help / color / mirror / Atom feed
* Reduce pack-objects memory footprint?
@ 2018-02-28  9:27 Duy Nguyen
  2018-02-28 10:17 ` Jeff King
                   ` (5 more replies)
  0 siblings, 6 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-02-28  9:27 UTC (permalink / raw)
  To: git

linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
all apps nearly unusable (granted the problem is partly Linux I/O
scheduler too). So I wonder if we can reduce pack-objects memory
footprint a bit.

This demonstration patch (probably breaks some tests) would reduce the
size of struct object_entry from 136 down to 112 bytes on
x86-64. There are 6483999 of these objects, so the saving is 17% or
148 MB.

If we go further, notice that nr_objects is uint32_t, we could convert
the three pointers

	struct object_entry *delta;
	struct object_entry *delta_child;
	struct object_entry *delta_sibling;

to

	uint32_t delta;
	uint32_t delta_child;
	uint32_t delta_sibling;

which saves 12 bytes (or another 74 MB). 222 MB total is plenty of
space to keep some file cache from being evicted.

Is it worth doing this? The struct packing makes it harder to read
(and more fragile too). I added some more artificial limits like max
depth of 2^11. But I think 4096+ depth is getting unreasonable.

-- 8< --
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..6a9804daec 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -877,8 +877,11 @@ static void write_pack_file(void)
 			strbuf_addf(&tmpname, "%s-", base_name);
 
 			if (write_bitmap_index) {
+				ALLOC_ARRAY(to_pack.in_pack_pos, to_pack.nr_objects);
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(written_list,
+							       nr_written,
+							       &to_pack);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
@@ -1407,6 +1410,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,8 +1419,9 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
+		entry->in_pack_type = type;
 		if (used == 0)
 			goto give_up;
 
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
@@ -1580,7 +1586,8 @@ static void drop_reused_delta(struct object_entry *entry)
 		 */
 		entry->type = sha1_object_info(entry->idx.oid.hash,
 					       &entry->size);
-	}
+	} else
+		entry->type = type;
 }
 
 /*
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..5c9957a095 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -49,7 +49,8 @@ void bitmap_writer_show_progress(int show)
  * Build the initial type index for the packfile
  */
 void bitmap_writer_build_type_index(struct pack_idx_entry **index,
-				    uint32_t index_nr)
+				    uint32_t index_nr,
+				    struct packing_data *to_pack)
 {
 	uint32_t i;
 
@@ -62,7 +63,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		to_pack->in_pack_pos[entry - to_pack->objects] = i;
 
 		switch (entry->type) {
 		case OBJ_COMMIT:
@@ -147,7 +148,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return writer.to_pack->in_pack_pos[entry - writer.to_pack->objects];
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..595914fa43 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = mapping->in_pack_pos[oe - mapping->objects] + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..2558f7662a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+				    uint32_t index_nr,
+				    struct packing_data *);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..caecad23b6 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,18 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE
+};
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
@@ -14,11 +26,10 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
+	unsigned type:3;	 /* enum object_type */
+	unsigned in_pack_type:3; /* enum object_type - could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
@@ -27,19 +38,8 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
-
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
-	int depth;
+	unsigned dfs_state:3;	/* enum dfs_state */
+	unsigned depth:11;
 };
 
 struct packing_data {
@@ -48,6 +48,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
-- 8< --

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
@ 2018-02-28 10:17 ` Jeff King
  2018-02-28 10:58   ` Duy Nguyen
  2018-02-28 18:22 ` Eric Wong
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-02-28 10:17 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: git

On Wed, Feb 28, 2018 at 04:27:22PM +0700, Duy Nguyen wrote:

> linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
> consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
> all apps nearly unusuable (granted the problem is partly Linux I/O
> scheduler too). So I wonder if we can reduce pack-objects memory
> footprint a bit.

Yeah, the per object memory footprint is not great. Around 100 million
objects it becomes pretty ridiculous. I started to dig into it a year or
three ago when I saw such a case, but it turned out to be something that
we could prune. The torvalds/linux fork network has ~23 million objects,
so it's probably 7-8 GB of book-keeping. Which is gross, but 64GB in a
server isn't uncommon these days.

I think laptops repacking the kernel are probably one of the worst cases
(leaving aside the awful Windows repository, but my impression is that
they simply can't do a full repack at all there).

> This demonstration patch (probably breaks some tests) would reduce the
> size of struct object_entry from from 136 down to 112 bytes on
> x86-64. There are 6483999 of these objects, so the saving is 17% or
> 148 MB.

136 x 6.5M objects is only about 800MB. I suspect a big chunk of the
rest is going to the object structs we create when doing the internal
rev-list traversal. And those duplicate the 20-byte sha1s at the very
least.

I don't know if it would be a good idea to free them after the
traversal, though. We do use them again later in the bitmap case. On the
other hand, we could probably do so for the non-bitmap case. And even
for the bitmap case, the primary value in keeping them around is that
the parent pointers will already be cached. So it might make sense to
free the blobs and trees (though it might be tricky; the commits have
pointers to the trees).

It also doesn't help with peak memory usage, because you'll have the
full to_pack list and all of the "struct object" in memory together at
one point.

Another option would be to somehow replace the pack_idx_entry with a
reference to a "struct object". That would let us at least avoid storing
the 20-byte oid twice.

> If we go further, notice that nr_objects is uint32_t, we could convert
> the three pointers
> 
> 	struct object_entry *delta;
> 	struct object_entry *delta_child;
> 	struct object_entry *delta_sibling;
> 
> to
> 
> 	uint32_t delta;
> 	uint32_t delta_child;
> 	uint32_t delta_sibling;
> 
> which saves 12 bytes (or another 74 MB). 222 MB total is plenty of
> space to keep some file cache from being evicted.

Yeah, that seems like low-hanging fruit. I'd also note that we don't
actually use all of the fields during the whole process. I think some of
those delta fields are only used for a short time. So we might be able
to reduce peak memory if there are some mutually exclusive bits of each
entry (and even if there's some overlap, we'd reduce the length of time
we'd need to be at peak).

> Is it worth doing this? The struct packing makes it harder to read
> (and more fragile too). I added some more artifical limit like max
> depth of 2^11. But I think 4096+ depth is getting unreasonable.

I'm OK with a limit like 4096, as long as we notice when we hit the
limit and behave reasonably. I think the algorithm in
break_delta_chains() may temporarily store up to uint32_t depth. But I
think we won't ever write anything into cur->depth larger than the
max-depth limit. So it would probably be sufficient to just check that
the --depth argument is reasonably sized and complain otherwise.

I do agree this makes things a bit harder to read, but I think the
benefits are pretty measurable. And may make a real usability difference
on a large repository.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28 10:17 ` Jeff King
@ 2018-02-28 10:58   ` Duy Nguyen
  2018-02-28 11:11     ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-02-28 10:58 UTC (permalink / raw)
  To: Jeff King; +Cc: Git Mailing List

On Wed, Feb 28, 2018 at 5:17 PM, Jeff King <peff@peff.net> wrote:
> On Wed, Feb 28, 2018 at 04:27:22PM +0700, Duy Nguyen wrote:
>
>> linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
>> consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
>> all apps nearly unusuable (granted the problem is partly Linux I/O
>> scheduler too). So I wonder if we can reduce pack-objects memory
>> footprint a bit.
>
> Yeah, the per object memory footprint is not great. Around 100 million
> objects it becomes pretty ridiculous. I started to dig into it a year or
> three ago when I saw such a case, but it turned out to be something that
> we could prune.

We could? What could we prune?

> The torvalds/linux fork network has ~23 million objects,
> so it's probably 7-8 GB of book-keeping. Which is gross, but 64GB in a
> server isn't uncommon these days.

I wonder if we could just do book keeping for some but not all objects
because all objects simply do not scale. Say we have a big pack of
many GBs, could we keep the 80% of its bottom untouched, register the
top 20% (mostly non-blobs, and some more blobs as delta base) for
repack? We copy the bottom part to the new pack byte-by-byte, then
pack-objects rebuilds the top part with objects from other sources.

That would of course be less optimal because we can't make delta
against those objects at the bottom. And this makes the assumption
that these packs are generated using the same heuristics that we use.

> I think laptops repacking the kernel are probably one of the worst cases
> (leaving aside the awful Windows repository, but my impression is that
> they simply can't do a full repack at all there).

Yeah. My fear is "git gc --auto" kicking in. Even on a laptop we
should support working on a repo as big as linux-2.6 (or as small as
one from facebook/microsoft perspective). I probably will add a mode
that keeps the largest pack alone and just pack the rest in a second
pack in "git gc --auto". That might help (I haven't really checked the
details yet, but object_entry book keeping should go down at least)

>> This demonstration patch (probably breaks some tests) would reduce the
>> size of struct object_entry from from 136 down to 112 bytes on
>> x86-64. There are 6483999 of these objects, so the saving is 17% or
>> 148 MB.
>
> 136 x 6.5M objects is only about 800MB. I suspect a big chunk of the
> rest is going to the object structs we create when doing the internal
> rev-list traversal. And those duplicate the 20-byte sha1s at the very
> least.

They are 32 bytes per entry, so it should take less than object_entry.
I briefly wondered if we should fall back to external rev-list too,
just to free that memory.

So about 200 MB for those objects (or maybe more for commits). Add 256
MB delta cache on top, it's still a bit far from 1.7G. There's
something I'm still missing.

> Another option would be to somehow replace the pack_idx_entry with a
> reference to a "struct object". That would let us at least avoid storing
> the 20-byte oid twice.

20 bytes saving? /me drools.

Pity we can't do the same for 'struct object'. Most of the time we
have a giant .idx file with most hashes. We could look up in both
places: the hash table in object.c, and the idx file, to find an
object. Then those objects that are associated with .idx file will not
need "oid" field (needed as key for the hash table). But I see no
way to make that change.

>> If we go further, notice that nr_objects is uint32_t, we could convert
>> the three pointers
>>
>>       struct object_entry *delta;
>>       struct object_entry *delta_child;
>>       struct object_entry *delta_sibling;
>>
>> to
>>
>>       uint32_t delta;
>>       uint32_t delta_child;
>>       uint32_t delta_sibling;
>>
>> which saves 12 bytes (or another 74 MB). 222 MB total is plenty of
>> space to keep some file cache from being evicted.
>
> Yeah, that seems like low-hanging fruit. I'd also note that we don't
> actually use all of the fields during the whole process. I think some of
> those delta fields are only used for a short time. So we might be able
> to reduce peak memory if there are some mutually exclusive bits of each
> entry (and even if there's some overlap, we'd reduce the length of time
> we'd need to be at peak).

Yeah it looks that way to me too. Now that we have packing_data,
splitting object_entry[] to multiple arrays (that is only needed at
some point) may be very nice.

>> Is it worth doing this? The struct packing makes it harder to read
>> (and more fragile too). I added some more artifical limit like max
>> depth of 2^11. But I think 4096+ depth is getting unreasonable.
>
> I'm OK with a limit like 4096, as long as we notice when we hit the
> limit and behave reasonably. I think the algorithm in
> break_delta_chains() may temporarily store up to uint32_t depth. But I
> think we won't ever write anything into cur->depth larger than the
> max-depth limit. So it would probably be sufficient to just check that
> the --depth argument is reasonably sized and complain otherwise.
>
> I do agree this makes things a bit harder to read, but I think the
> benefits are pretty measurable. And may make a real usability difference
> on a large repository.

Thanks. I'll go make real patches then (with lots more checks to make
sure we don't overflow in shortened fields like depth or type).
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28 10:58   ` Duy Nguyen
@ 2018-02-28 11:11     ` Jeff King
  2018-02-28 11:24       ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-02-28 11:11 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

On Wed, Feb 28, 2018 at 05:58:50PM +0700, Duy Nguyen wrote:

> > Yeah, the per object memory footprint is not great. Around 100 million
> > objects it becomes pretty ridiculous. I started to dig into it a year or
> > three ago when I saw such a case, but it turned out to be something that
> > we could prune.
> 
> We could? What could we prune?

Sorry, I just meant that my 100 million-object case turned out not to
need all those objects, and I was able to prune it down. No code fixes
came out of it. ;)

> > The torvalds/linux fork network has ~23 million objects,
> > so it's probably 7-8 GB of book-keeping. Which is gross, but 64GB in a
> > server isn't uncommon these days.
> 
> I wonder if we could just do book keeping for some but not all objects
> because all objects simply do not scale. Say we have a big pack of
> many GBs, could we keep the 80% of its bottom untouched, register the
> top 20% (mostly non-blobs, and some more blobs as delta base) for
> repack? We copy the bottom part to the new pack byte-by-byte, then
> pack-objects rebuilds the top part with objects from other sources.

Yes, though I think it would take a fair bit of surgery to do
internally. And some features (like bitmap generation) just wouldn't
work at all.

I suspect you could simulate it, though, by just packing your subset
with pack-objects (feeding it directly without using "--revs") and then
catting the resulting packfiles together with a fixed-up header.

At one point I played with a "fast pack" that would just cat packfiles
together. My goal was to make cases with 10,000 packs workable by
creating one lousy pack, and then repacking that lousy pack with a
"real" repack. In the end I abandoned it in favor of fixing the
performance problems from trying to make a real pack of 10,000 packs. :)

But I might be able to dig it up if you want to experiment in that
direction.

> They are 32 bytes per entry, so it should take less than object_entry.
> I briefly wondered if we should fall back to external rev-list too,
> just to free that memory.
> 
> So about 200 MB for those objects (or maybe more for commits). Add 256
> MB delta cache on top, it's still a bit far from 1.7G. There's
> something I'm still missing.

Are you looking at RSS or heap? Keep in mind that you're mmap-ing what's
probably a 1GB packfile on disk. If you're under memory pressure that
won't all stay resident, but some of it will be counted in RSS.

> Pity we can't do the same for 'struct object'. Most of the time we
> have a giant .idx file with most hashes. We could look up in both
> places: the hash table in object.c, and the idx file, to find an
> object. Then those objects that are associated with .idx file will not
> need "oid" field (needed to as key for the hash table). But I see no
> way to make that change.

Yeah, that would be pretty invasive, I think. I also wonder if it would
perform worse due to cache effects.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28 11:11     ` Jeff King
@ 2018-02-28 11:24       ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-02-28 11:24 UTC (permalink / raw)
  To: Jeff King; +Cc: Git Mailing List

On Wed, Feb 28, 2018 at 6:11 PM, Jeff King <peff@peff.net> wrote:
>> > The torvalds/linux fork network has ~23 million objects,
>> > so it's probably 7-8 GB of book-keeping. Which is gross, but 64GB in a
>> > server isn't uncommon these days.
>>
>> I wonder if we could just do book keeping for some but not all objects
>> because all objects simply do not scale. Say we have a big pack of
>> many GBs, could we keep the 80% of its bottom untouched, register the
>> top 20% (mostly non-blobs, and some more blobs as delta base) for
>> repack? We copy the bottom part to the new pack byte-by-byte, then
>> pack-objects rebuilds the top part with objects from other sources.
>
> Yes, though I think it would take a fair bit of surgery to do
> internally. And some features (like bitmap generation) just wouldn't
> work at all.
>
> I suspect you could simulate it, though, by just packing your subset
> with pack-objects (feeding it directly without using "--revs") and then
> catting the resulting packfiles together with a fixed-up header.
>
> At one point I played with a "fast pack" that would just cat packfiles
> together. My goal was to make cases with 10,000 packs workable by
> creating one lousy pack, and then repacking that lousy pack with a
> "real" repack. In the end I abandoned it in favor of fixing the
> performance problems from trying to make a real pack of 10,000 packs. :)
>
> But I might be able to dig it up if you want to experiment in that
> direction.

Naah it's ok. I'll go similar direction, but I'd repack those pack
files too except the big one. Let's see how it turns out.

>> They are 32 bytes per entry, so it should take less than object_entry.
>> I briefly wondered if we should fall back to external rev-list too,
>> just to free that memory.
>>
>> So about 200 MB for those objects (or maybe more for commits). Add 256
>> MB delta cache on top, it's still a bit far from 1.7G. There's
>> something I'm still missing.
>
> Are you looking at RSS or heap? Keep in mind that you're mmap-ing what's
> probably a 1GB packfile on disk. If you're under memory pressure that
> won't all stay resident, but some of it will be counted in RSS.

Interesting. It was RSS.

>> Pity we can't do the same for 'struct object'. Most of the time we
>> have a giant .idx file with most hashes. We could look up in both
>> places: the hash table in object.c, and the idx file, to find an
>> object. Then those objects that are associated with .idx file will not
>> need "oid" field (needed to as key for the hash table). But I see no
>> way to make that change.
>
> Yeah, that would be pretty invasive, I think. I also wonder if it would
> perform worse due to cache effects.

It should be better because of cache effects, I think. I mean, hash
map is the least cache friendly lookup. Moving most objects out of the
hash table shrinks it, which is even nicer to cache. But we also lose
O(1) when we do binary search on .idx file (after failing to find the
same object in the hash table)
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
  2018-02-28 10:17 ` Jeff King
@ 2018-02-28 18:22 ` Eric Wong
  2018-03-01  9:00   ` Duy Nguyen
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 273+ messages in thread
From: Eric Wong @ 2018-02-28 18:22 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: git

Duy Nguyen <pclouds@gmail.com> wrote:
> which saves 12 bytes (or another 74 MB). 222 MB total is plenty of
> space to keep some file cache from being evicted.

Nice!  I can definitely benefit from lower memory usage when
packing.  Fwiw, I use pahole with other projects to help find
packing opportunities:

	git://git.kernel.org/pub/scm/devel/pahole/pahole.git

> @@ -14,11 +26,10 @@ struct object_entry {
>  	void *delta_data;	/* cached delta (uncompressed) */
>  	unsigned long delta_size;	/* delta data size (uncompressed) */
>  	unsigned long z_delta_size;	/* delta data size (compressed) */
> -	enum object_type type;
> -	enum object_type in_pack_type;	/* could be delta */
>  	uint32_t hash;			/* name hint hash */
> -	unsigned int in_pack_pos;
>  	unsigned char in_pack_header_size;
> +	unsigned type:3;	 /* enum object_type */
> +	unsigned in_pack_type:3; /* enum object_type - could be delta */

For C99 compilers, enums can be bitfields.  I introduced the
following macro into Ruby a few weeks ago to remain compatible
with non-C99 compilers:

/*
 * For declaring bitfields out of non-unsigned int types:
 *   struct date {
 *      BITFIELD(enum months) month:4;
 *      ...
 *   };
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# define BITFIELD(type) type
#else
# define BITFIELD(type) unsigned int
#endif

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28 18:22 ` Eric Wong
@ 2018-03-01  9:00   ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-01  9:00 UTC (permalink / raw)
  To: Eric Wong; +Cc: git

On Wed, Feb 28, 2018 at 06:22:33PM +0000, Eric Wong wrote:
> Duy Nguyen <pclouds@gmail.com> wrote:
> > which saves 12 bytes (or another 74 MB). 222 MB total is plenty of
> > space to keep some file cache from being evicted.
> 
> Nice!  I can definitely benefit from lower memory usage when
> packing.  Fwiw, I use pahole with other projects to help find
> packing opportunities:
> 
> 	git://git.kernel.org/pub/scm/devel/pahole/pahole.git

Yes it's a wonderful tool.

> > @@ -14,11 +26,10 @@ struct object_entry {
> >  	void *delta_data;	/* cached delta (uncompressed) */
> >  	unsigned long delta_size;	/* delta data size (uncompressed) */
> >  	unsigned long z_delta_size;	/* delta data size (compressed) */
> > -	enum object_type type;
> > -	enum object_type in_pack_type;	/* could be delta */
> >  	uint32_t hash;			/* name hint hash */
> > -	unsigned int in_pack_pos;
> >  	unsigned char in_pack_header_size;
> > +	unsigned type:3;	 /* enum object_type */
> > +	unsigned in_pack_type:3; /* enum object_type - could be delta */
> 
> For C99 compilers, enums can be bitfields.  I introduced the
> following macro into Ruby a few weeks ago to remain compatible
> with non-C99 compilers:
> 
> /*
>  * For declaring bitfields out of non-unsigned int types:
>  *   struct date {
>  *      BITFIELD(enum months) month:4;
>  *      ...
>  *   };
>  */
> #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
> # define BITFIELD(type) type
> #else
> # define BITFIELD(type) unsigned int
> #endif

I tried this and got

In file included from builtin/pack-objects.c:20:0:
./pack-objects.h:49:19: lỗi: ‘type’ is narrower than values of its type [-Werror]
  enum object_type type:TYPE_BITS;
                   ^~~~

The compiler is not wrong. What it does not realize is pack-objects
code never uses out-of-range values (OBJ_BAD and OBJ_ANY) but I don't
see how I could suppress this warning. So I went back to non-enum
bitfields.

--
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH 00/11] Reduce pack-objects memory footprint
  2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
  2018-02-28 10:17 ` Jeff King
  2018-02-28 18:22 ` Eric Wong
@ 2018-03-01  9:10 ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 01/11] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
                     ` (12 more replies)
  2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
                   ` (2 subsequent siblings)
  5 siblings, 13 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

The array of object_entry in pack-objects can take a lot of memory
when pack-objects is run in "pack everything" mode. On linux-2.6.git,
this array alone takes roughly 800MB.

This series reorders some fields and reduces field size... to keep
this struct smaller. Its size goes from 136 bytes to 96 bytes (29%) on
64-bit linux and saves 260MB on linux-2.6.git.

Now the bad side:

- the number of pack files pack-objects can handle is reduced to 4096
  (previously unlimited)
- max delta chain is also limited to 4096 (previously practically
  unlimited)
- some patches are quite invasive (e.g. replacing pointer with
  uint32_t) and reduce readability a bit.
- it may be trickier to add more data in object_entry in the future.

Nguyễn Thái Ngọc Duy (11):
  pack-objects: document holes in struct object_entry.h
  pack-objects: turn type and in_pack_type to bitfields
  pack-objects: use bitfield for object_entry::dfs_state
  pack-objects: use bitfield for object_entry::depth
  pack-objects: note about in_pack_header_size
  pack-objects: move in_pack_pos out of struct object_entry
  pack-objects: move in_pack out of struct object_entry
  pack-objects: faster reverse packed_git lookup
  pack-objects: refer to delta objects by index instead of pointer
  pack-objects: reorder 'hash' to pack struct object_entry
  pack-objects: increase pack file limit to 4096

 builtin/pack-objects.c | 189 ++++++++++++++++++++++++++---------------
 cache.h                |   3 +
 object.h               |   1 -
 pack-bitmap-write.c    |   8 +-
 pack-bitmap.c          |   2 +-
 pack-bitmap.h          |   4 +-
 pack-objects.h         |  70 ++++++++++-----
 7 files changed, 180 insertions(+), 97 deletions(-)

-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH 01/11] pack-objects: document holes in struct object_entry.h
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                     ` (11 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..720a8e8756 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -28,6 +28,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 
+	/* XXX 28 bits hole, try to pack */
 	/*
 	 * State flags for depth-first search used for analyzing delta cycles.
 	 *
@@ -40,6 +41,7 @@ struct object_entry {
 		DFS_DONE
 	} dfs_state;
 	int depth;
+	/* size: 136, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 02/11] pack-objects: turn type and in_pack_type to bitfields
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 01/11] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                     ` (10 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

This saves 8 bytes in sizeof(struct object_entry). On a large
repository like linux-2.6.git (6.5M objects), this saves us 52MB
memory.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 14 ++++++++++++--
 cache.h                |  2 ++
 object.h               |  1 -
 pack-objects.h         |  8 ++++----
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..fd217cb51f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
@@ -1580,6 +1586,10 @@ static void drop_reused_delta(struct object_entry *entry)
 		 */
 		entry->type = sha1_object_info(entry->idx.oid.hash,
 					       &entry->size);
+	} else {
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->type = type;
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-objects.h b/pack-objects.h
index 720a8e8756..f8b06e2521 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,11 +14,11 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
+	unsigned type:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
@@ -28,7 +28,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 
-	/* XXX 28 bits hole, try to pack */
+	/* XXX 22 bits hole, try to pack */
 	/*
 	 * State flags for depth-first search used for analyzing delta cycles.
 	 *
@@ -41,7 +41,7 @@ struct object_entry {
 		DFS_DONE
 	} dfs_state;
 	int depth;
-	/* size: 136, padding: 4 */
+	/* size: 128, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 03/11] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 01/11] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                     ` (9 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 33 ++++++++++++++++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fd217cb51f..a4dbb40824 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index f8b06e2521..fca334ab4d 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS 2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
@@ -27,21 +42,13 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
+
+	/* XXX 20 bits hole, try to pack */
 
-	/* XXX 22 bits hole, try to pack */
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
-	/* size: 128, padding: 4 */
+
+	/* size: 120, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 04/11] pack-objects: use bitfield for object_entry::depth
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01 18:00     ` Junio C Hamano
  2018-03-01  9:10   ` [PATCH 05/11] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
                     ` (8 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

This does not give us any saving due to padding. But we will be able
to save once we cut 4 bytes out of this struct in a subsequent patch.

Because of struct packing, from now on we can only handle a max depth
of 4095 (or even lower when new booleans are added to this struct).
This should be ok since long delta chains cause significant slowdown
anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 4 ++++
 pack-objects.h         | 6 ++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index a4dbb40824..cfd97da7db 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth > (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS));
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index fca334ab4d..3941e6c9a6 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS 2
+#define OE_DEPTH_BITS 12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -43,10 +44,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-
-	/* XXX 20 bits hole, try to pack */
-
-	int depth;
+	unsigned depth:OE_DEPTH_BITS;
 
 	/* size: 120, padding: 4 */
 };
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 05/11] pack-objects: note about in_pack_header_size
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (3 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 06/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                     ` (7 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

The object header in a pack is packed really tight (see
pack-format.txt). Even with an 8-byte length, we need 9-10 bytes at
most, plus a hash (20 bytes). This means the field only needs to store
a number smaller than 32 (5 bits).

This is trickier to pack tight, though: since a new hash algorithm is
coming, the number of bits needed may quickly increase. So leave it
for now.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pack-objects.h b/pack-objects.h
index 3941e6c9a6..017cc3425f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -32,7 +32,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
-	unsigned char in_pack_header_size;
+	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 06/11] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (4 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 05/11] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 07/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                     ` (6 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (like objects[], it is never freed). This saves
us 8 bytes in struct object_entry.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 3 ++-
 pack-bitmap-write.c    | 8 +++++---
 pack-bitmap.c          | 2 +-
 pack-bitmap.h          | 4 +++-
 pack-objects.h         | 8 ++++++--
 5 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index cfd97da7db..7bb5544883 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -878,7 +878,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..1360a93311 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		IN_PACK_POS(to_pack, entry) = i;
 
 		switch (entry->type) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return IN_PACK_POS(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..f21479fe16 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = IN_PACK_POS(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index 017cc3425f..3bef28196c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,9 @@
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
 
+#define IN_PACK_POS(to_pack, obj) \
+	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
+
 /*
  * State flags for depth-first search used for analyzing delta cycles.
  *
@@ -31,7 +34,6 @@ struct object_entry {
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -46,7 +48,7 @@ struct object_entry {
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 120, padding: 4 */
+	/* size: 112 */
 };
 
 struct packing_data {
@@ -55,6 +57,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 07/11] pack-objects: move in_pack out of struct object_entry
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (5 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 06/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01 12:37     ` Ævar Arnfjörð Bjarmason
                       ` (2 more replies)
  2018-03-01  9:10   ` [PATCH 08/11] pack-objects: faster reverse packed_git lookup Nguyễn Thái Ngọc Duy
                     ` (5 subsequent siblings)
  12 siblings, 3 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
pack, use an index, since the number of packs should be relatively
small.

This limits the number of packs we can handle to 256 (still
unreasonably high for a repo to work well). If you have more than 256
packs, you'll need an older version of Git to repack first.

This technically saves 7 bytes. But we don't see any of that in
practice due to padding. The saving becomes real when we pack this
struct tighter later.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 48 ++++++++++++++++++++++++++++++++----------
 pack-objects.h         | 12 +++++++++--
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7bb5544883..d0d371714a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -367,7 +367,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(&to_pack, entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +478,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(&to_pack, entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
@@ -1074,7 +1074,15 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		int i;
+
+		for (i = 0; i < (1 << OE_IN_PACK_BITS); i++)
+			if (to_pack.in_pack[i] == found_pack) {
+				entry->in_pack_idx = i;
+				break;
+			}
+		if (i == (1 << OE_IN_PACK_BITS))
+			die("BUG: pack not found!");
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1407,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(&to_pack, entry)) {
+		struct packed_git *p = IN_PACK(&to_pack, entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1543,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(&to_pack, a);
+	const struct packed_git *b_in_pack = IN_PACK(&to_pack, b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1588,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(&to_pack, entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1858,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(&to_pack, trg_entry) &&
+	    IN_PACK(&to_pack, trg_entry) == IN_PACK(&to_pack, src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -2958,6 +2968,21 @@ static int option_parse_unpack_unreachable(const struct option *opt,
 	return 0;
 }
 
+static void init_in_pack_mapping(struct packing_data *to_pack)
+{
+	struct packed_git *p;
+	int i = 0;
+
+	/* let IN_PACK() return NULL if in_pack_idx is zero */
+	to_pack->in_pack[i++] = NULL;
+
+	for (p = packed_git; p; p = p->next, i++) {
+		if (i >= (1 << OE_IN_PACK_BITS))
+			die("BUG: too many packs to handle!");
+		to_pack->in_pack[i] = p;
+	}
+}
+
 int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 {
 	int use_internal_rev_list = 0;
@@ -3190,6 +3215,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			}
 		}
 	}
+	init_in_pack_mapping(&to_pack);
 
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
diff --git a/pack-objects.h b/pack-objects.h
index 3bef28196c..839d5dc4fd 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,10 +3,14 @@
 
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
+#define OE_IN_PACK_BITS 8
 
 #define IN_PACK_POS(to_pack, obj) \
 	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
 
+#define IN_PACK(to_pack, obj) \
+	(to_pack)->in_pack[(obj)->in_pack_idx]
+
 /*
  * State flags for depth-first search used for analyzing delta cycles.
  *
@@ -23,7 +27,6 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -35,6 +38,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
@@ -46,9 +50,12 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+
+	/* XXX 12 bits hole, try to pack */
+
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 112 */
+	/* size: 112, padding: 4 */
 };
 
 struct packing_data {
@@ -59,6 +66,7 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 08/11] pack-objects: faster reverse packed_git lookup
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (6 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 07/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 09/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                     ` (4 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

We do a linear search for in_pack index in create_object_entry(). This
function is called for every available object in the worst case (and
on linux-2.6.git, that's about 6.5M). Try to avoid that by saving the
index in packed_git. Since we should not have zillions of packs, this
extra space should not be a big deal.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 11 ++---------
 cache.h                |  1 +
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index d0d371714a..1fdb85ebb5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1074,15 +1074,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		int i;
-
-		for (i = 0; i < (1 << OE_IN_PACK_BITS); i++)
-			if (to_pack.in_pack[i] == found_pack) {
-				entry->in_pack_idx = i;
-				break;
-			}
-		if (i == (1 << OE_IN_PACK_BITS))
-			die("BUG: pack not found!");
+		entry->in_pack_idx = found_pack->index;
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -2980,6 +2972,7 @@ static void init_in_pack_mapping(struct packing_data *to_pack)
 		if (i >= (1 << OE_IN_PACK_BITS))
 			die("BUG: too many packs to handle!");
 		to_pack->in_pack[i] = p;
+		p->index = i;
 	}
 }
 
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 09/11] pack-objects: refer to delta objects by index instead of pointer
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (7 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 08/11] pack-objects: faster reverse packed_git lookup Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01 18:08     ` Junio C Hamano
  2018-03-01  9:10   ` [PATCH 10/11] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
                     ` (3 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

Since packing_data::nr_objects is uint32_t, we can handle at most 4G
objects and can address all of them with a uint32_t. Using a pointer
here wastes 4 bytes on 64 bit architectures.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

There are holes in this struct but this patch is already big. Struct
packing can be done separately. Even with holes, we save 8 bytes per
object_entry.

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 124 +++++++++++++++++++++++------------------
 pack-objects.h         |  14 +++--
 2 files changed, 78 insertions(+), 60 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1fdb85ebb5..45076f2523 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,20 @@
 #include "list.h"
 #include "packfile.h"
 
+#define DELTA(obj) \
+	((obj)->delta_idx ? &to_pack.objects[(obj)->delta_idx - 1] : NULL)
+#define DELTA_CHILD(obj) \
+	((obj)->delta_child_idx ? &to_pack.objects[(obj)->delta_child_idx - 1] : NULL)
+#define DELTA_SIBLING(obj) \
+	((obj)->delta_sibling_idx ? &to_pack.objects[(obj)->delta_sibling_idx - 1] : NULL)
+
+#define CLEAR_DELTA(obj) (obj)->delta_idx = 0
+#define CLEAR_DELTA_CHILD(obj) (obj)->delta_child_idx = 0
+#define CLEAR_DELTA_SIBLING(obj) (obj)->delta_sibling_idx = 0
+
+#define SET_DELTA(obj, val) (obj)->delta_idx = ((val) - to_pack.objects) + 1
+#define SET_DELTA_CHILD(obj, val) (obj)->delta_child_idx = ((val) - to_pack.objects) + 1
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -125,11 +139,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -286,12 +300,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -315,7 +329,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -341,7 +355,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -377,8 +391,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -406,7 +420,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -425,7 +439,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -465,13 +479,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -486,7 +500,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (entry->type != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -538,12 +552,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			CLEAR_DELTA(e);
 			break;
 		default:
 			break;
@@ -605,34 +619,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -643,7 +657,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -658,8 +672,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		CLEAR_DELTA_CHILD(&objects[i]);
+		CLEAR_DELTA_SIBLING(&objects[i]);
 	}
 
 	/*
@@ -669,11 +683,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1491,10 +1505,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			entry->type = entry->in_pack_type;
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1565,17 +1579,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	CLEAR_DELTA(entry);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1617,7 +1633,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1642,7 +1658,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1665,7 +1681,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1680,7 +1696,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1863,7 +1879,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1939,7 +1955,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1968,7 +1984,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1977,13 +1993,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2052,7 +2068,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2102,7 +2118,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2110,7 +2126,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2431,7 +2447,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index 839d5dc4fd..f339f0411a 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -28,11 +28,13 @@ struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
+	/* XXX 4 bytes hole, try to pack */
+
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -55,7 +57,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 112, padding: 4 */
+	/* size: 104, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 10/11] pack-objects: reorder 'hash' to pack struct object_entry
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (8 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 09/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:10   ` [PATCH 11/11] pack-objects: increase pack file limit to 4096 Nguyễn Thái Ngọc Duy
                     ` (2 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index f339f0411a..52087b32e5 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -33,12 +33,10 @@ struct object_entry {
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	/* XXX 4 bytes hole, try to pack */
-
+	uint32_t hash;			/* name hint hash */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	uint32_t hash;			/* name hint hash */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
@@ -57,7 +55,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 104, padding: 4 */
+	/* size: 96 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH 11/11] pack-objects: increase pack file limit to 4096
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (9 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 10/11] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:10   ` Nguyễn Thái Ngọc Duy
  2018-03-01 13:33   ` [PATCH 00/11] Reduce pack-objects memory footprint Ævar Arnfjörð Bjarmason
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:10 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

When OE_IN_PACK_BITS was added, we didn't have many bits left to spare
so the max number of packs that pack-objects could handle was limited
to 256. Now that we have more spare bits, let's increase it to 4096 to be
on the safe side. If you have more than this many packs, you may need
to reconsider if you're still sane.

Increasing this also increases memory a bit because in_pack[] array in
packing_data is bigger, roughly 32kb, which is insignificant in
pack-objects context.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index 52087b32e5..ec4eba4ee4 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,7 +3,7 @@
 
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
-#define OE_IN_PACK_BITS 8
+#define OE_IN_PACK_BITS 12
 
 #define IN_PACK_POS(to_pack, obj) \
 	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
@@ -24,6 +24,11 @@ enum dfs_state {
 	DFS_NUM_STATES
 };
 
+/*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
@@ -51,7 +56,7 @@ struct object_entry {
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 
-	/* XXX 12 bits hole, try to pack */
+	/* XXX 8 bits hole, try to pack */
 
 	unsigned depth:OE_DEPTH_BITS;
 
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto
  2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
                   ` (2 preceding siblings ...)
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:20 ` Nguyễn Thái Ngọc Duy
  2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
  2018-03-02 10:18 ` Reduce pack-objects memory footprint? Duy Nguyen
  2018-03-17 22:05 ` Why does pack-objects use so much memory on incremental packing? Ævar Arnfjörð Bjarmason
  5 siblings, 2 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:20 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

The series [1] I just sent helps reduce pack-objects memory footprint
a bit. But even then it's still a huge memory hog. So this patch makes
a special treatment for gc --auto: avoid it completely.

The trick here is not new (pinning the largest pack with a .keep
file). It's just never done automatically. I think this is a good
thing to do, provided that gc --auto estimates memory usage more or
less correctly.

And "git gc --auto" should run even on weak machines because it's part
of regular repo maintenance. You can't tell people "You can't work on
linux-2.6.git repository because your machine has too little memory".

The only thing left I think I should do is to use an external rev-list
to free up some more memory. But let's see how the first patch goes
first (documents and tests are missing, I know).

[1] https://public-inbox.org/git/%3C20180301091052.32267-1-pclouds@gmail.com%3E/

Nguyễn Thái Ngọc Duy (1):
  gc --auto: exclude the largest giant pack in low-memory config

 builtin/gc.c           | 125 +++++++++++++++++++++++++++++++++++++++--
 builtin/pack-objects.c |   2 +-
 config.mak.uname       |   1 +
 git-compat-util.h      |   4 ++
 pack-objects.h         |   2 +
 5 files changed, 128 insertions(+), 6 deletions(-)

-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
@ 2018-03-01  9:20   ` Nguyễn Thái Ngọc Duy
  2018-03-01 18:14     ` Junio C Hamano
  2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
  1 sibling, 2 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-01  9:20 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Eric Wong, Nguyễn Thái Ngọc Duy

pack-objects can be a big memory hog, especially on large repos;
everybody knows that. The suggestion to stick a .keep file on the
largest pack to avoid this problem has also been known for a long time.

Let's apply the suggestion automatically instead of waiting for people
to come to the Git mailing list and get the advice. When a certain
condition is met, gc --auto creates a temporary .keep file before repack
is run, then removes it afterward.

gc --auto does this based on an estimation of pack-objects memory
usage and whether that fits in one third of system memory (the
assumption here is for desktop environment where there are many other
applications running).

Since the estimate may be inaccurate and the 1/3 threshold is
arbitrary, give the user finer control over this mechanism as well:
if the largest pack is larger than gc.bigPackThreshold, it's kept.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/gc.c           | 125 +++++++++++++++++++++++++++++++++++++++--
 builtin/pack-objects.c |   2 +-
 config.mak.uname       |   1 +
 git-compat-util.h      |   4 ++
 pack-objects.h         |   2 +
 5 files changed, 128 insertions(+), 6 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..2d9965bcdf 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,10 @@
 #include "argv-array.h"
 #include "commit.h"
 #include "packfile.h"
+#include "pack.h"
+#include "pack-objects.h"
+#include "blob.h"
+#include "tree.h"
 
 #define FAILED_RUN "failed to run %s"
 
@@ -39,6 +43,8 @@ static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
+static unsigned long big_pack_threshold = 0;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -49,6 +55,7 @@ static struct argv_array rerere = ARGV_ARRAY_INIT;
 
 static struct tempfile *pidfile;
 static struct lock_file log_lock;
+static struct strbuf temp_keep_file = STRBUF_INIT;
 
 static struct string_list pack_garbage = STRING_LIST_INIT_DUP;
 
@@ -93,6 +100,18 @@ static void process_log_file(void)
 	}
 }
 
+static void delete_temp_keep_file(void)
+{
+	unlink(temp_keep_file.buf);
+}
+
+static void delete_temp_keep_file_on_signal(int signo)
+{
+	delete_temp_keep_file();
+	sigchain_pop(signo);
+	raise(signo);
+}
+
 static void process_log_file_at_exit(void)
 {
 	fflush(stderr);
@@ -126,6 +145,9 @@ static void gc_config(void)
 	git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire);
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
+	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
+	git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size);
+
 	git_config(git_default_config, NULL);
 }
 
@@ -164,7 +186,7 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static int too_many_packs(void)
+static int too_many_packs(struct packed_git **largest_pack)
 {
 	struct packed_git *p;
 	int cnt;
@@ -173,22 +195,104 @@ static int too_many_packs(void)
 		return 0;
 
 	prepare_packed_git();
+	*largest_pack = NULL;
 	for (cnt = 0, p = packed_git; p; p = p->next) {
 		if (!p->pack_local)
 			continue;
 		if (p->pack_keep)
 			continue;
+		if (!*largest_pack || (*largest_pack)->pack_size  < p->pack_size)
+			*largest_pack = p;
 		/*
 		 * Perhaps check the size of the pack and count only
 		 * very small ones here?
 		 */
 		cnt++;
 	}
+
 	return gc_auto_pack_limit < cnt;
 }
 
-static void add_repack_all_option(void)
+static inline unsigned long total_ram(void)
+{
+	unsigned long default_ram = 4;
+#ifdef HAVE_SYSINFO
+	struct sysinfo si;
+
+	if (!sysinfo(&si))
+		return si.totalram;
+#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
+	int64_t physical_memory;
+	int mib[2];
+	int length;
+
+	mib[0] = CTL_HW;
+	mib[1] = HW_MEMSIZE;
+	length = sizeof(int64_t);
+	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
+		return physical_memory;
+#elif defined(GIT_WINDOWS_NATIVE)
+	MEMORYSTATUSEX memInfo;
+
+	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+	if (GlobalMemoryStatusEx(&memInfo))
+		return memInfo.ullTotalPhys;
+#else
+	fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
+		default_ram);
+#endif
+	return default_ram * 1024 * 1024 * 1024;
+}
+
+static int pack_objects_uses_too_much_memory(struct packed_git *pack)
+{
+	unsigned long nr_objects = approximate_object_count();
+	size_t mem_want, mem_have;
+
+	if (!pack || !nr_objects)
+		return 0;
+
+	if (big_pack_threshold)
+		return pack->pack_size >= big_pack_threshold;
+
+	/* First we have to scan through at least one pack */
+	mem_want = pack->pack_size + pack->index_size;
+	/* then pack-objects needs lots more for book keeping */
+	mem_want += sizeof(struct object_entry) * nr_objects;
+	/*
+	 * internal rev-list --all --objects takes up some memory too,
+	 * let's say half of it is for blobs
+	 */
+	mem_want += sizeof(struct blob) * nr_objects / 2;
+	/*
+	 * and the other half is for trees (commits and tags are
+	 * usually insignificant)
+	 */
+	mem_want += sizeof(struct tree) * nr_objects / 2;
+	/* and then obj_hash[], underestimated in fact */
+	mem_want += sizeof(struct object *) * nr_objects;
+	/*
+	 * read_sha1_file() (either at delta calculation phase, or
+	 * writing phase) also fills up the delta base cache
+	 */
+	mem_want += delta_base_cache_limit;
+	/* and of course pack-objects has its own delta cache */
+	mem_want += max_delta_cache_size;
+
+	/* Only allow 1/3 of memory for pack-objects */
+	mem_have = total_ram() / 3;
+
+	return mem_want >= mem_have;
+
+}
+
+static void add_repack_all_option(struct packed_git *exclude_pack)
 {
+	if (pack_objects_uses_too_much_memory(exclude_pack)) {
+		strbuf_addstr(&temp_keep_file, exclude_pack->pack_name);
+		strbuf_strip_suffix(&temp_keep_file, ".pack");
+		strbuf_addstr(&temp_keep_file, ".keep");
+	}
 	if (prune_expire && !strcmp(prune_expire, "now"))
 		argv_array_push(&repack, "-a");
 	else {
@@ -205,6 +309,7 @@ static void add_repack_incremental_option(void)
 
 static int need_to_gc(void)
 {
+	struct packed_git *largest_pack = NULL;
 	/*
 	 * Setting gc.auto to 0 or negative can disable the
 	 * automatic gc.
@@ -218,8 +323,8 @@ static int need_to_gc(void)
 	 * we run "repack -A -d -l".  Otherwise we tell the caller
 	 * there is no need.
 	 */
-	if (too_many_packs())
-		add_repack_all_option();
+	if (too_many_packs(&largest_pack))
+		add_repack_all_option(largest_pack);
 	else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
@@ -428,7 +533,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 			daemonized = !daemonize();
 		}
 	} else
-		add_repack_all_option();
+		add_repack_all_option(NULL);
 
 	name = lock_repo_for_gc(force, &pid);
 	if (name) {
@@ -450,6 +555,16 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 	if (gc_before_repack())
 		return -1;
 
+	if (temp_keep_file.len) {
+		int fd = open(temp_keep_file.buf, O_CREAT | O_RDWR, 0644);
+		if (fd != -1) {
+			sigchain_push_common(delete_temp_keep_file_on_signal);
+			atexit(delete_temp_keep_file);
+			close(fd);
+		} else {
+			strbuf_release(&temp_keep_file);
+		}
+	}
 	if (!repository_format_precious_objects) {
 		if (run_command_v_opt(repack.argv, RUN_GIT_CMD))
 			return error(FAILED_RUN, repack.argv[0]);
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..722cc999dc 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -78,7 +78,7 @@ static uint16_t write_bitmap_options;
 static int exclude_promisor_objects;
 
 static unsigned long delta_cache_size = 0;
-static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
diff --git a/config.mak.uname b/config.mak.uname
index 6a1d0de0cc..ae9cbccec1 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -37,6 +37,7 @@ ifeq ($(uname_S),Linux)
 	HAVE_GETDELIM = YesPlease
 	SANE_TEXT_GREP=-a
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	BASIC_CFLAGS += -DHAVE_SYSINFO
 endif
 ifeq ($(uname_S),GNU/kFreeBSD)
 	HAVE_ALLOCA_H = YesPlease
diff --git a/git-compat-util.h b/git-compat-util.h
index 68b2ad531e..a84b21986d 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -284,6 +284,10 @@ extern char *gitdirname(char *);
 #include <openssl/err.h>
 #endif
 
+#ifdef HAVE_SYSINFO
+# include <sys/sysinfo.h>
+#endif
+
 /* On most systems <netdb.h> would have given us this, but
  * not on some systems (e.g. z/OS).
  */
diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..af4f46c026 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,8 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH 07/11] pack-objects: move in_pack out of struct object_entry
  2018-03-01  9:10   ` [PATCH 07/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-01 12:37     ` Ævar Arnfjörð Bjarmason
  2018-03-01 14:49     ` Jeff King
  2018-03-01 18:05     ` Junio C Hamano
  2 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-01 12:37 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong


On Thu, Mar 01 2018, Nguyễn Thái Ngọc Duy jotted:

> pack. Use an index isntead since the number of packs should be

s/isntead/instead/

> This limits the number of packs we can handle to 256 (still
> unreasonably high for a repo to work well). If you have more than 256
> packs, you'll need an older version of Git to repack first.

So if you have gc.autoPackLimit=300 this will break, how does it break?

Should we also make (& document) setting that variable higher than 256
an error?

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 00/11] Reduce pack-objects memory footprint
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (10 preceding siblings ...)
  2018-03-01  9:10   ` [PATCH 11/11] pack-objects: increase pack file limit to 4096 Nguyễn Thái Ngọc Duy
@ 2018-03-01 13:33   ` Ævar Arnfjörð Bjarmason
  2018-03-02  0:14     ` Duy Nguyen
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
  12 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-01 13:33 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong


On Thu, Mar 01 2018, Nguyễn Thái Ngọc Duy jotted:

> The array of object_entry in pack-objects can take a lot of memory
> when pack-objects is run in "pack everything" mode. On linux-2.6.git,
> this array alone takes roughly 800MB.
>
> This series reorders some fields and reduces field size... to keep
> this struct smaller. Its size goes from 136 bytes to 96 bytes (29%) on
> 64-bit linux and saves 260MB on linux-2.6.git.

I'm very interested in this patch series. I don't have time to test this
one right now (have to run), but with your previous RFC patch memory use
(in the ~4GB range) on a big in-house repo went down by a bit over 3%,
and it's ~5% faster.

Before/after RSS 4440812 / 4290000 & runtime 172.73 / 162.45. This is
after having already done a full git gc before, data via /usr/bin/time
-v.

So not huge, but respectable.

We have a big repo, and this gets repacked on 6-8GB of memory on dev
KVMs, so we're under a fair bit of memory pressure. git-gc slows things
down a lot.

It would be really nice to have something that made it use drastically
less memory at the cost of less efficient packs. Is the need to spend
roughly the size of .git/objects in memory something inherent, or just
a limitation of the current implementation?
I.e. could we do a first pass to pick some objects based on some
heuristic, then repack them N at a time, and finally delete the
now-obsolete packs?

Another thing I've dealt with is that on these machines their
NFS-mounted storage gets exhausted (I'm told) due to some pathological
operations git does during repack; I/O tends to get 5-6x slower. Of
course ionice doesn't help because the local kernel doesn't know
anything about how harmful it is.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 07/11] pack-objects: move in_pack out of struct object_entry
  2018-03-01  9:10   ` [PATCH 07/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
  2018-03-01 12:37     ` Ævar Arnfjörð Bjarmason
@ 2018-03-01 14:49     ` Jeff King
  2018-03-02  0:02       ` Duy Nguyen
  2018-03-01 18:05     ` Junio C Hamano
  2 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-01 14:49 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Eric Wong

On Thu, Mar 01, 2018 at 04:10:48PM +0700, Nguyễn Thái Ngọc Duy wrote:

> Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
> pack. Use an index isntead since the number of packs should be
> relatively small.
> 
> This limits the number of packs we can handle to 256 (still
> unreasonably high for a repo to work well). If you have more than 256
> packs, you'll need an older version of Git to repack first.

I overall like the direction of this series, but I think this one is
just too much. While you definitely shouldn't have a ton of packs, this
leaves the user with no real escape hatch. And 256 isn't actually that
many packs.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 04/11] pack-objects: use bitfield for object_entry::depth
  2018-03-01  9:10   ` [PATCH 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-01 18:00     ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-01 18:00 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> This does not give us any saving due to padding. But we will be able
> to save once we cut 4 bytes out of this struct in a subsequent patch.
>
> Because of struct packing from now on we can only handle max depth
> 4095 (or even lower when new booleans are added in this struct). This
> should be ok since long delta chain will cause significant slow down
> anyway.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---

This should be marked as RFC/PATCH; I do not have objection to
limiting the delta depth to some reasonable length (rather than the
current 1<<31 or worse 1<<63), and 4k may be such a reasonable limit
(I'd actually think anything more than a few hundreds is probably a
bad idea), but it needs to be documented.

>  builtin/pack-objects.c | 4 ++++
>  pack-objects.h         | 6 ++----
>  2 files changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index a4dbb40824..cfd97da7db 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
>  	if (pack_to_stdout != !base_name || argc)
>  		usage_with_options(pack_usage, pack_objects_options);
>  
> +	if (depth > (1 << OE_DEPTH_BITS))
> +		die(_("delta chain depth %d is greater than maximum limit %d"),
> +		    depth, (1 << OE_DEPTH_BITS));
> +
>  	argv_array_push(&rp, "pack-objects");
>  	if (thin) {
>  		use_internal_rev_list = 1;
> diff --git a/pack-objects.h b/pack-objects.h
> index fca334ab4d..3941e6c9a6 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -2,6 +2,7 @@
>  #define PACK_OBJECTS_H
>  
>  #define OE_DFS_STATE_BITS 2
> +#define OE_DEPTH_BITS 12
>  
>  /*
>   * State flags for depth-first search used for analyzing delta cycles.
> @@ -43,10 +44,7 @@ struct object_entry {
>  	unsigned tagged:1; /* near the very tip of refs */
>  	unsigned filled:1; /* assigned write-order */
>  	unsigned dfs_state:OE_DFS_STATE_BITS;
> -
> -	/* XXX 20 bits hole, try to pack */
> -
> -	int depth;
> +	unsigned depth:OE_DEPTH_BITS;
>  
>  	/* size: 120, padding: 4 */
>  };

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 07/11] pack-objects: move in_pack out of struct object_entry
  2018-03-01  9:10   ` [PATCH 07/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
  2018-03-01 12:37     ` Ævar Arnfjörð Bjarmason
  2018-03-01 14:49     ` Jeff King
@ 2018-03-01 18:05     ` Junio C Hamano
  2 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-01 18:05 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
> pack. Use an index isntead since the number of packs should be
> relatively small.
>
> This limits the number of packs we can handle to 256 (still
> unreasonably high for a repo to work well). If you have more than 256
> packs, you'll need an older version of Git to repack first.

I can tell without looking at the rest of the thread that people had
reasonable objection against this stance ;-)  This will not fly well.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 09/11] pack-objects: refer to delta objects by index instead of pointer
  2018-03-01  9:10   ` [PATCH 09/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-01 18:08     ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-01 18:08 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Notice that packing_data::nr_objects is uint32_t, we could only handle
> maximum 4G objects and can address all of them with an uint32_t. If we
> use a pointer here, we waste 4 bytes on 64 bit architecture.
>
> Convert these delta pointers to indexes. Since we need to handle NULL
> pointers as well, the index is shifted by one [1].

Makes perfect sense.

I do not think losing 1 slot out of possible 4G is a regression,
unlike the 256 packfile limit 07/11 imposes.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
@ 2018-03-01 18:14     ` Junio C Hamano
  2018-03-02  0:00       ` Duy Nguyen
  2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-01 18:14 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> pack-objects could be a big memory hog especially on large repos,
> everybody knows that. The suggestion to stick a .keep file on the
> largest pack to avoid this problem is also known for a long time.

Yup, but note that it is not "largest" per se.  The thing being large
is a mere consequence that it is the base pack that holds the bulk
of older parts of the history (e.g. the one that you obtained via
the initial clone).

> Let's do the suggestion automatically instead of waiting for people to
> come to Git mailing list and get the advice. When a certain condition
> is met, gc --auto create a .keep file temporary before repack is run,
> then remove it afterward.
>
> gc --auto does this based on an estimation of pack-objects memory
> usage and whether that fits in one third of system memory (the
> assumption here is for desktop environment where there are many other
> applications running).
>
> Since the estimation may be inaccurate and that 1/3 threshold is
> arbitrary, give the user a finer control over this mechanism as well:
> if the largest pack is larger than gc.bigPackThreshold, it's kept.

If this is a transient mechanism during a single gc session, it
would be far more preferable if we can find a way to do this
without actually having a .keep file on the filesystem.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01 18:14     ` Junio C Hamano
@ 2018-03-02  0:00       ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-02  0:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List, Jeff King, Eric Wong

On Fri, Mar 2, 2018 at 1:14 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> pack-objects could be a big memory hog especially on large repos,
>> everybody knows that. The suggestion to stick a .keep file on the
>> largest pack to avoid this problem is also known for a long time.
>
> Yup, but not that it is not "largest" per-se.  The thing being large
> is a mere consequence that it is the base pack that holds the bulk
> of older parts of the history (e.g. the one that you obtained via
> the initial clone).

Thanks, "base pack" sounds much better. I was having trouble with
wording because I didn't nail this one down.

>> Let's do the suggestion automatically instead of waiting for people to
>> come to Git mailing list and get the advice. When a certain condition
>> is met, gc --auto create a .keep file temporary before repack is run,
>> then remove it afterward.
>>
>> gc --auto does this based on an estimation of pack-objects memory
>> usage and whether that fits in one third of system memory (the
>> assumption here is for desktop environment where there are many other
>> applications running).
>>
>> Since the estimation may be inaccurate and that 1/3 threshold is
>> arbitrary, give the user a finer control over this mechanism as well:
>> if the largest pack is larger than gc.bigPackThreshold, it's kept.
>
> If this is a transient mechanism during a single gc session, it
> would be far more preferrable if we can find a way to do this
> without actually having a .keep file on the filesystem.

That was my first attempt, manipulating packed_git::pack_keep inside
pack-objects. Then my whole git.git was gone. I was scared off so I
did this instead.

I've learned my lesson though (never test dangerous operations on your
worktree!) and will do that pack_keep again _if_ this gc --auto still
sounds like a good direction to go.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 07/11] pack-objects: move in_pack out of struct object_entry
  2018-03-01 14:49     ` Jeff King
@ 2018-03-02  0:02       ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-02  0:02 UTC (permalink / raw)
  To: Jeff King; +Cc: Git Mailing List, Eric Wong

On Thu, Mar 1, 2018 at 9:49 PM, Jeff King <peff@peff.net> wrote:
> On Thu, Mar 01, 2018 at 04:10:48PM +0700, Nguyễn Thái Ngọc Duy wrote:
>
>> Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
>> pack. Use an index isntead since the number of packs should be
>> relatively small.
>>
>> This limits the number of packs we can handle to 256 (still
>> unreasonably high for a repo to work well). If you have more than 256
>> packs, you'll need an older version of Git to repack first.
>
> I overall like the direction of this series, but I think this one is
> just too much. While you definitely shouldn't have a ton of packs, this
> leaves the user with no real escape hatch. And 256 isn't actually that
> many packs.

It was raised back to 4096 at the end (I didn't know how many spare
bits we had at this point).

Agreed on the escape hatch though. I think we could do better: if
there are more than X packs, we repack X packs into one and leave the
rest alone. The _next_ pack-objects will pick another X packs to
combine. Repeat until you only have one pack left.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 00/11] Reduce pack-objects memory footprint
  2018-03-01 13:33   ` [PATCH 00/11] Reduce pack-objects memory footprint Ævar Arnfjörð Bjarmason
@ 2018-03-02  0:14     ` Duy Nguyen
  2018-03-02 10:57       ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-02  0:14 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Git Mailing List, Jeff King, Eric Wong

On Thu, Mar 1, 2018 at 8:33 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Thu, Mar 01 2018, Nguyễn Thái Ngọc Duy jotted:
>
>> The array of object_entry in pack-objects can take a lot of memory
>> when pack-objects is run in "pack everything" mode. On linux-2.6.git,
>> this array alone takes roughly 800MB.
>>
>> This series reorders some fields and reduces field size... to keep
>> this struct smaller. Its size goes from 136 bytes to 96 bytes (29%) on
>> 64-bit linux and saves 260MB on linux-2.6.git.
>
> I'm very interested in this patch series. I don't have time to test this
> one right now (have to run), but with your previous RFC patch memory use
> (in the ~4GB range) on a big in-house repo went down by a bit over 3%,
> and it's ~5% faster.
>
> Before/after RSS 4440812 / 4290000 & runtime 172.73 / 162.45. This is
> after having already done a full git gc before, data via /usr/bin/time
> -v.

Jeff correctly pointed out elsewhere in this thread that RSS covers
both heap (this is what I try to reduce) and some file cache (we mmap
the whole pack file just to ease the reading) so RSS might not be a good
indicator of memory reduction. Any new freed memory should be used for
cache which raises RSS back up. I think the RssAnon field in
/proc/<pid>/status shows it better.

> So not huge, but respectable.
>
> We have a big repo, and this gets repacked on 6-8GB of memory on dev
> KVMs, so we're under a fair bit of memory pressure. git-gc slows things
> down a lot.
>
> It would be really nice to have something that made it use drastically
> less memory at the cost of less efficient packs. Is the property that

Ahh.. less efficient. You may be more interested in [1] then. It
avoids rewriting the base pack. Without the base pack, book keeping
becomes much much cheaper.

We still read every single byte in all packs though (I think, unless
you use pack-bitmap) and this amount of I/O affects the rest of the
system too. Perhaps reducing core.packedgitwindowsize might make it
friendlier to the OS, I don't know.

> you need to spend give or take the size of .git/objects in memory
> something inherent, or just a limitation of the current implementation?
> I.e. could we do a first pass to pick some objects based on some
> heuristic, then repack them N at a time, and finally delete the
> now-obsolete packs?
>
> Another thing I've dealt with is that on these machines their
> NFS-mounted storage gets exhausted (I'm told) due to some pathological
> operations git does during repack, I/O tends to get 5-6x slower. Of
> course ionice doesn't help because the local kernel doesn't know
> anything about how harmful it is.

[1] https://public-inbox.org/git/20180301092046.2769-1-pclouds@gmail.com/T/#u
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
                   ` (3 preceding siblings ...)
  2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
@ 2018-03-02 10:18 ` Duy Nguyen
  2018-03-02 10:37   ` Eric Wong
  2018-03-02 10:54   ` Jeff King
  2018-03-17 22:05 ` Why does pack-objects use so much memory on incremental packing? Ævar Arnfjörð Bjarmason
  5 siblings, 2 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-02 10:18 UTC (permalink / raw)
  To: Git Mailing List
  Cc: Jeff King, Eric Wong, Ævar Arnfjörð Bjarmason

On Wed, Feb 28, 2018 at 4:27 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
> consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
> all apps nearly unusuable (granted the problem is partly Linux I/O
> scheduler too). So I wonder if we can reduce pack-objects memory
> footprint a bit.

Next low hanging fruit item:

struct revindex_entry {
        off_t offset;
        unsigned int nr;
};

We need one entry per object, so 6.5M objects * 16 bytes = 104 MB. If
we break this struct apart and store two arrays of offset and nr in
struct packed_git, we save 4 bytes per struct, 26 MB total.

It's getting low but every megabyte counts for me, and it does not
look like breaking this struct will make horrible code (we recreate
the struct at find_pack_revindex()) so I'm going to do this too unless
someone objects. There will be slight performance regression due to
cache effects, but hopefully it's ok.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-03-02 10:18 ` Reduce pack-objects memory footprint? Duy Nguyen
@ 2018-03-02 10:37   ` Eric Wong
  2018-03-02 10:54   ` Jeff King
  1 sibling, 0 replies; 273+ messages in thread
From: Eric Wong @ 2018-03-02 10:37 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Git Mailing List, Jeff King, Ævar Arnfjörð Bjarmason

Duy Nguyen <pclouds@gmail.com> wrote:
> struct revindex_entry {
>         off_t offset;
>         unsigned int nr;
> };
> 
> We need on entry per object, so 6.5M objects * 16 bytes = 104 MB. If
> we break this struct apart and store two arrays of offset and nr in
> struct packed_git, we save 4 bytes per struct, 26 MB total.

Can the offset array be a union which stores
int32_t/uint32_t instead of off_t for projects which never
exceed 2/4GB?

Likewise, places in object_entry where "unsigned long" and off_t
are 64-bit could benefit from being 32-bit.  Testing/maintenance
overhead could be bad, for those, though.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-03-02 10:18 ` Reduce pack-objects memory footprint? Duy Nguyen
  2018-03-02 10:37   ` Eric Wong
@ 2018-03-02 10:54   ` Jeff King
  2018-03-02 10:55     ` Duy Nguyen
  2018-03-02 14:38     ` Duy Nguyen
  1 sibling, 2 replies; 273+ messages in thread
From: Jeff King @ 2018-03-02 10:54 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Git Mailing List, Eric Wong, Ævar Arnfjörð Bjarmason

On Fri, Mar 02, 2018 at 05:18:45PM +0700, Duy Nguyen wrote:

> On Wed, Feb 28, 2018 at 4:27 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> > linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
> > consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
> > all apps nearly unusuable (granted the problem is partly Linux I/O
> > scheduler too). So I wonder if we can reduce pack-objects memory
> > footprint a bit.
> 
> Next low hanging fruit item:
> 
> struct revindex_entry {
>         off_t offset;
>         unsigned int nr;
> };
> 
> We need on entry per object, so 6.5M objects * 16 bytes = 104 MB. If
> we break this struct apart and store two arrays of offset and nr in
> struct packed_git, we save 4 bytes per struct, 26 MB total.
> 
> It's getting low but every megabyte counts for me, and it does not
> look like breaking this struct will make horrible code (we recreate
> the struct at find_pack_revindex()) so I'm going to do this too unless
> someone objects. There will be slight performance regression due to
> cache effects, but hopefully it's ok.

Maybe you will prove me wrong, but I don't think splitting them is going
to work. The point of the revindex_entry is that we sort the (offset,nr)
tuple as a unit.

Or are you planning to sort it, and then copy the result into two
separate arrays? I think that would work, but it sounds kind of nasty
(arcane code, and extra CPU work for systems that don't care about the
26MB).

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-03-02 10:54   ` Jeff King
@ 2018-03-02 10:55     ` Duy Nguyen
  2018-03-02 14:38     ` Duy Nguyen
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-02 10:55 UTC (permalink / raw)
  To: Jeff King
  Cc: Git Mailing List, Eric Wong, Ævar Arnfjörð Bjarmason

On Fri, Mar 2, 2018 at 5:54 PM, Jeff King <peff@peff.net> wrote:
> On Fri, Mar 02, 2018 at 05:18:45PM +0700, Duy Nguyen wrote:
>
>> On Wed, Feb 28, 2018 at 4:27 PM, Duy Nguyen <pclouds@gmail.com> wrote:
>> > linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
>> > consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
>> > all apps nearly unusuable (granted the problem is partly Linux I/O
>> > scheduler too). So I wonder if we can reduce pack-objects memory
>> > footprint a bit.
>>
>> Next low hanging fruit item:
>>
>> struct revindex_entry {
>>         off_t offset;
>>         unsigned int nr;
>> };
>>
>> We need on entry per object, so 6.5M objects * 16 bytes = 104 MB. If
>> we break this struct apart and store two arrays of offset and nr in
>> struct packed_git, we save 4 bytes per struct, 26 MB total.
>>
>> It's getting low but every megabyte counts for me, and it does not
>> look like breaking this struct will make horrible code (we recreate
>> the struct at find_pack_revindex()) so I'm going to do this too unless
>> someone objects. There will be slight performance regression due to
>> cache effects, but hopefully it's ok.
>
> Maybe you will prove me wrong, but I don't think splitting them is going
> to work. The point of the revindex_entry is that we sort the (offset,nr)
> tuple as a unit.
>
> Or are you planning to sort it, and then copy the result into two
> separate arrays?

Yep.

> I think that would work, but it sounds kind of nasty

Yeah :(

> (arcane code, and extra CPU work for systems that don't care about the
> 26MB).
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH 00/11] Reduce pack-objects memory footprint
  2018-03-02  0:14     ` Duy Nguyen
@ 2018-03-02 10:57       ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-02 10:57 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Git Mailing List, Eric Wong

On Fri, Mar 02, 2018 at 07:14:01AM +0700, Duy Nguyen wrote:

> > We have a big repo, and this gets repacked on 6-8GB of memory on dev
> > KVMs, so we're under a fair bit of memory pressure. git-gc slows things
> > down a lot.
> >
> > It would be really nice to have something that made it use drastically
> > less memory at the cost of less efficient packs. Is the property that
> 
> Ahh.. less efficient. You may be more interested in [1] then. It
> avoids rewriting the base pack. Without the base pack, book keeping
> becomes much much cheaper.
> 
> We still read every single byte in all packs though (I think, unless
> you use pack-bitmap) and this amount of I/O affect the rest of the
> system too. Perhaps reducing core.packedgitwindowsize might make it
> friendlier to the OS, I don't know.

Yes, the ".keep" thing is actually quite expensive. We still do a
complete rev-list to find all the objects we want, and then for each
object say "is this in a pack with .keep?". And worse, the mru doesn't
help there because even if we find it in the first pack, we have to keep
looking to see if it's _another_ pack.

There are probably some low-hanging optimizations there (e.g., only
looking in the .keep packs if that's all we're looking for; we may even
do that already).

But I think fundamentally you'd do much better to generate the partial
list of objects outside of pack-objects entirely, and then just feed it
to pack-objects without using "--revs".

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Reduce pack-objects memory footprint?
  2018-03-02 10:54   ` Jeff King
  2018-03-02 10:55     ` Duy Nguyen
@ 2018-03-02 14:38     ` Duy Nguyen
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-02 14:38 UTC (permalink / raw)
  To: Jeff King
  Cc: Git Mailing List, Eric Wong, Ævar Arnfjörð Bjarmason

On Fri, Mar 2, 2018 at 5:54 PM, Jeff King <peff@peff.net> wrote:
> On Fri, Mar 02, 2018 at 05:18:45PM +0700, Duy Nguyen wrote:
>
>> On Wed, Feb 28, 2018 at 4:27 PM, Duy Nguyen <pclouds@gmail.com> wrote:
>> > linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
>> > consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
>> > all apps nearly unusuable (granted the problem is partly Linux I/O
>> > scheduler too). So I wonder if we can reduce pack-objects memory
>> > footprint a bit.
>>
>> Next low hanging fruit item:
>>
>> struct revindex_entry {
>>         off_t offset;
>>         unsigned int nr;
>> };
>>
>> We need on entry per object, so 6.5M objects * 16 bytes = 104 MB. If
>> we break this struct apart and store two arrays of offset and nr in
>> struct packed_git, we save 4 bytes per struct, 26 MB total.
>>
>> It's getting low but every megabyte counts for me, and it does not
>> look like breaking this struct will make horrible code (we recreate
>> the struct at find_pack_revindex()) so I'm going to do this too unless
>> someone objects. There will be slight performance regression due to
>> cache effects, but hopefully it's ok.
>
> Maybe you will prove me wrong, but I don't think splitting them is going
> to work. The point of the revindex_entry is that we sort the (offset,nr)
> tuple as a unit.
>
> Or are you planning to sort it, and then copy the result into two
> separate arrays? I think that would work, but it sounds kind of nasty
> (arcane code, and extra CPU work for systems that don't care about the
> 26MB).


How about two level lookups? We have

struct revindex_entry_l2 {
        uint32_t offset; /* the lowest 32 bits */
        uint32_t nr;
};

struct revindex {
        struct revindex_entry *level1[256]; /* 8 high bits */
};

This limits us to 1024GB pack files, which should give us some time
before we have to worry about it again and most of the time we'll need
just one or two items in level1[] so cache effects are not that bad.
Preparing/Sorting this could be a problem though.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 0/9] Reduce pack-objects memory footprint
  2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
                     ` (11 preceding siblings ...)
  2018-03-01 13:33   ` [PATCH 00/11] Reduce pack-objects memory footprint Ævar Arnfjörð Bjarmason
@ 2018-03-03  2:46   ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:46     ` [PATCH/RFC v2 1/9] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
                       ` (10 more replies)
  12 siblings, 11 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:46 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

The array of object_entry in pack-objects can take a lot of memory
when pack-objects is run in "pack everything" mode. On linux-2.6.git,
this array alone takes roughly 800MB.

This series reorders some fields and reduces field size... to keep
this struct smaller. Its size goes from 136 bytes to 96 bytes (29%) on
64-bit linux and saves 260MB on linux-2.6.git.

v2 only differs in limits, documentation and a new escape hatch for
the pack file limit.

Now the bad side:

- the number of pack files pack-objects can handle is reduced to 16k
  (previously unlimited, v1 has it at 4k)
- max delta chain is also limited to 4096 (previously practically
  unlimited), same as v1
- some patches are quite invasive (e.g. replacing pointer with
  uint32_t) and reduce readability a bit.
- it may be trickier to add more data in object_entry in the future.

In v1, if the total pack count is above the 4k limit, pack-objects
dies. v2 is a bit smarter and only counts packs that do not have the
companion .keep files. This allows users with 16k+ pack files to
continue to use pack-objects by first adding .keep to reduce pack
count, repack, remove (some) .keep files, repack again...

While this process could be automated at least by 'git repack', given
the unbelievably high limit 16k, I don't think it's worth doing it.

Interdiff

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..b8d936ccf5 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
@@ -267,6 +269,15 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
+LIMITATIONS
+-----------
+
+This command could only handle 16384 existing pack files at a time.
+If you have more than this, you need to exclude some pack files with
+".keep" file and --honor-pack-keep option, to combine 16k pack files
+in one, then remove these .keep files and run pack-objects one more
+time.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 45076f2523..55f19a1f18 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1038,7 +1038,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			return want;
+			goto done;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1061,11 +1061,27 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				return want;
+				goto done;
 		}
 	}
 
-	return 1;
+	want = 1;
+done:
+	if (want && *found_pack && !(*found_pack)->index) {
+		struct packed_git *p = *found_pack;
+
+		if (to_pack.in_pack_count >= (1 << OE_IN_PACK_BITS))
+			die(_("too many packs to handle in one go. "
+			      "Please add .keep files to exclude\n"
+			      "some pack files and keep the number "
+			      "of non-kept files below %d."),
+			    1 << OE_IN_PACK_BITS);
+
+		p->index = to_pack.in_pack_count++;
+		to_pack.in_pack[p->index] = p;
+	}
+
+	return want;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1088,6 +1104,8 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
+		if (found_pack->index <= 0)
+			die("BUG: found_pack should be NULL instead of having non-positive index");
 		entry->in_pack_idx = found_pack->index;
 		entry->in_pack_offset = found_offset;
 	}
@@ -2978,18 +2996,12 @@ static int option_parse_unpack_unreachable(const struct option *opt,
 
 static void init_in_pack_mapping(struct packing_data *to_pack)
 {
-	struct packed_git *p;
-	int i = 0;
-
 	/* let IN_PACK() return NULL if in_pack_idx is zero */
-	to_pack->in_pack[i++] = NULL;
-
-	for (p = packed_git; p; p = p->next, i++) {
-		if (i >= (1 << OE_IN_PACK_BITS))
-			die("BUG: too many packs to handle!");
-		to_pack->in_pack[i] = p;
-		p->index = i;
-	}
+	to_pack->in_pack[to_pack->in_pack_count++] = NULL;
+	/*
+	 * the rest is lazily initialized only for packs that we want
+	 * in want_object_in_pack().
+	 */
 }
 
 int cmd_pack_objects(int argc, const char **argv, const char *prefix)
diff --git a/pack-objects.h b/pack-objects.h
index ec4eba4ee4..a57aca5f03 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,7 +3,7 @@
 
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
-#define OE_IN_PACK_BITS 12
+#define OE_IN_PACK_BITS 14
 
 #define IN_PACK_POS(to_pack, obj) \
 	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
@@ -60,7 +60,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 96 */
+	/* size: 96, bit_padding: 18 bits */
 };
 
 struct packing_data {
@@ -71,6 +71,7 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	int in_pack_count;
 	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 

Nguyễn Thái Ngọc Duy (9):
  pack-objects: document holes in struct object_entry.h
  pack-objects: turn type and in_pack_type to bitfields
  pack-objects: use bitfield for object_entry::dfs_state
  pack-objects: use bitfield for object_entry::depth
  pack-objects: note about in_pack_header_size
  pack-objects: move in_pack_pos out of struct object_entry
  pack-objects: move in_pack out of struct object_entry
  pack-objects: refer to delta objects by index instead of pointer
  pack-objects: reorder 'hash' to pack struct object_entry

 Documentation/config.txt           |   1 +
 Documentation/git-pack-objects.txt |  13 +-
 Documentation/git-repack.txt       |   4 +-
 builtin/pack-objects.c             | 207 +++++++++++++++++++----------
 cache.h                            |   3 +
 object.h                           |   1 -
 pack-bitmap-write.c                |   8 +-
 pack-bitmap.c                      |   2 +-
 pack-bitmap.h                      |   4 +-
 pack-objects.h                     |  71 ++++++----
 10 files changed, 212 insertions(+), 102 deletions(-)

-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 1/9] pack-objects: document holes in struct object_entry.h
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:46     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:46     ` [PATCH/RFC v2 2/9] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                       ` (9 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:46 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..720a8e8756 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -28,6 +28,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 
+	/* XXX 28 bits hole, try to pack */
 	/*
 	 * State flags for depth-first search used for analyzing delta cycles.
 	 *
@@ -40,6 +41,7 @@ struct object_entry {
 		DFS_DONE
 	} dfs_state;
 	int depth;
+	/* size: 136, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 2/9] pack-objects: turn type and in_pack_type to bitfields
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
  2018-03-03  2:46     ` [PATCH/RFC v2 1/9] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:46     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 3/9] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                       ` (8 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:46 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

This saves 8 bytes in sizeof(struct object_entry). On a large
repository like linux-2.6.git (6.5M objects), this saves us 52MB
memory.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 14 ++++++++++++--
 cache.h                |  2 ++
 object.h               |  1 -
 pack-objects.h         |  8 ++++----
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..fd217cb51f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
@@ -1580,6 +1586,10 @@ static void drop_reused_delta(struct object_entry *entry)
 		 */
 		entry->type = sha1_object_info(entry->idx.oid.hash,
 					       &entry->size);
+	} else {
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->type = type;
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-objects.h b/pack-objects.h
index 720a8e8756..f8b06e2521 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -14,11 +14,11 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
+	unsigned type:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
@@ -28,7 +28,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 
-	/* XXX 28 bits hole, try to pack */
+	/* XXX 22 bits hole, try to pack */
 	/*
 	 * State flags for depth-first search used for analyzing delta cycles.
 	 *
@@ -41,7 +41,7 @@ struct object_entry {
 		DFS_DONE
 	} dfs_state;
 	int depth;
-	/* size: 136, padding: 4 */
+	/* size: 128, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 3/9] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
  2018-03-03  2:46     ` [PATCH/RFC v2 1/9] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
  2018-03-03  2:46     ` [PATCH/RFC v2 2/9] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 4/9] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                       ` (7 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 33 ++++++++++++++++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fd217cb51f..a4dbb40824 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index f8b06e2521..6a85cc60c9 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS 2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
@@ -27,21 +42,13 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
+
+	/* XXX 20 bits hole, try to pack */
 
-	/* XXX 22 bits hole, try to pack */
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
-	/* size: 128, padding: 4 */
+
+	/* size: 120 */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 4/9] pack-objects: use bitfield for object_entry::depth
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (2 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 3/9] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 5/9] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
                       ` (6 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

This does not give us any saving due to padding. But we will be able
to save once we cut 4 bytes out of this struct in a subsequent patch.

Because of struct packing, from now on we can only handle a max depth
of 4095 (or even lower when new booleans are added to this struct).
This should be ok since long delta chains will cause significant
slowdown anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 4 ++++
 pack-objects.h                     | 8 +++-----
 5 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index a4dbb40824..cfd97da7db 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth > (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS));
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 6a85cc60c9..2050a05a0b 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS 2
+#define OE_DEPTH_BITS 12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -43,12 +44,9 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+	unsigned depth:OE_DEPTH_BITS;
 
-	/* XXX 20 bits hole, try to pack */
-
-	int depth;
-
-	/* size: 120 */
+	/* size: 120, bit_padding: 8 bits */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 5/9] pack-objects: note about in_pack_header_size
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (3 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 4/9] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 6/9] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                       ` (5 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

The object header in a pack is packed really tight (see
pack-format.txt). Even with an 8-byte length, we need 9-10 bytes at
most, plus a hash (20 bytes), which means this field only needs to
store a number smaller than 32 (5 bits).

This is trickier to pack tight, though: since a new hash algorithm is
coming, the number of bits needed may quickly increase. So leave it
for now.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pack-objects.h b/pack-objects.h
index 2050a05a0b..fb2a3c8f48 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -32,7 +32,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
-	unsigned char in_pack_header_size;
+	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 6/9] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (4 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 5/9] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 7/9] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                       ` (4 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (like objects[], it is never freed). This saves
us 8 bytes in struct object_entry.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 3 ++-
 pack-bitmap-write.c    | 8 +++++---
 pack-bitmap.c          | 2 +-
 pack-bitmap.h          | 4 +++-
 pack-objects.h         | 8 ++++++--
 5 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index cfd97da7db..7bb5544883 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -878,7 +878,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..1360a93311 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		IN_PACK_POS(to_pack, entry) = i;
 
 		switch (entry->type) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return IN_PACK_POS(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..f21479fe16 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = IN_PACK_POS(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index fb2a3c8f48..737e89b665 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,9 @@
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
 
+#define IN_PACK_POS(to_pack, obj) \
+	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
+
 /*
  * State flags for depth-first search used for analyzing delta cycles.
  *
@@ -31,7 +34,6 @@ struct object_entry {
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -46,7 +48,7 @@ struct object_entry {
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 120, bit_padding: 8 bits */
+	/* size: 112, bit_padding: 8 bits */
 };
 
 struct packing_data {
@@ -55,6 +57,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 7/9] pack-objects: move in_pack out of struct object_entry
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (5 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 6/9] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 8/9] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                       ` (3 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
pack, use an index, since the number of packs should be relatively
small.

This limits the number of packs we can handle to 16k. For now, if you hit
the 16k pack file limit, pack-objects will simply fail [1].

This technically saves 7 bytes. But we don't see any of that in
practice due to padding. The saving becomes real when we pack this
struct tighter later.

[1] The escape hatch is to use .keep files to keep the number of
    non-kept pack files below the 16k limit. Then you can go for another
    pack-objects run to combine another 16k pack files. Repeat until
    you're satisfied.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 +++++
 builtin/pack-objects.c             | 59 +++++++++++++++++++++++-------
 cache.h                            |  1 +
 pack-objects.h                     | 18 ++++++++-
 4 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 3503c9e3e6..b8d936ccf5 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -269,6 +269,15 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
+LIMITATIONS
+-----------
+
+This command could only handle 16384 existing pack files at a time.
+If you have more than this, you need to exclude some pack files with
+".keep" file and --honor-pack-keep option, to combine 16k pack files
+in one, then remove these .keep files and run pack-objects one more
+time.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7bb5544883..5818bf73ca 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -367,7 +367,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(&to_pack, entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +478,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(&to_pack, entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
@@ -1024,7 +1024,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			return want;
+			goto done;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1047,11 +1047,27 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				return want;
+				goto done;
 		}
 	}
 
-	return 1;
+	want = 1;
+done:
+	if (want && *found_pack && !(*found_pack)->index) {
+		struct packed_git *p = *found_pack;
+
+		if (to_pack.in_pack_count >= (1 << OE_IN_PACK_BITS))
+			die(_("too many packs to handle in one go. "
+			      "Please add .keep files to exclude\n"
+			      "some pack files and keep the number "
+			      "of non-kept files below %d."),
+			    1 << OE_IN_PACK_BITS);
+
+		p->index = to_pack.in_pack_count++;
+		to_pack.in_pack[p->index] = p;
+	}
+
+	return want;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1074,7 +1090,9 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		if (found_pack->index <= 0)
+			die("BUG: found_pack should be NULL instead of having non-positive index");
+		entry->in_pack_idx = found_pack->index;
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1417,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(&to_pack, entry)) {
+		struct packed_git *p = IN_PACK(&to_pack, entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1553,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(&to_pack, a);
+	const struct packed_git *b_in_pack = IN_PACK(&to_pack, b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1598,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(&to_pack, entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1868,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(&to_pack, trg_entry) &&
+	    IN_PACK(&to_pack, trg_entry) == IN_PACK(&to_pack, src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -2958,6 +2978,16 @@ static int option_parse_unpack_unreachable(const struct option *opt,
 	return 0;
 }
 
+static void init_in_pack_mapping(struct packing_data *to_pack)
+{
+	/* let IN_PACK() return NULL if in_pack_idx is zero */
+	to_pack->in_pack[to_pack->in_pack_count++] = NULL;
+	/*
+	 * the rest is lazily initialized only for packs that we want
+	 * in want_object_in_pack().
+	 */
+}
+
 int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 {
 	int use_internal_rev_list = 0;
@@ -3190,6 +3220,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			}
 		}
 	}
+	init_in_pack_mapping(&to_pack);
 
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.h b/pack-objects.h
index 737e89b665..83d91a0765 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,10 +3,14 @@
 
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
+#define OE_IN_PACK_BITS 14
 
 #define IN_PACK_POS(to_pack, obj) \
 	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
 
+#define IN_PACK(to_pack, obj) \
+	(to_pack)->in_pack[(obj)->in_pack_idx]
+
 /*
  * State flags for depth-first search used for analyzing delta cycles.
  *
@@ -20,10 +24,14 @@ enum dfs_state {
 	DFS_NUM_STATES
 };
 
+/*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -35,6 +43,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
@@ -46,9 +55,12 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+
+	/* XXX 8 bits hole, try to pack */
+
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 112, bit_padding: 8 bits */
+	/* size: 112, padding: 4, bit_padding: 18 bits */
 };
 
 struct packing_data {
@@ -59,6 +71,8 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	int in_pack_count;
+	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 8/9] pack-objects: refer to delta objects by index instead of pointer
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (6 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 7/9] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-03  2:47     ` [PATCH/RFC v2 9/9] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
                       ` (2 subsequent siblings)
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

Note that packing_data::nr_objects is uint32_t, so we can only handle
at most 4G objects and can address all of them with a uint32_t. If we
use a pointer here, we waste 4 bytes on 64-bit architectures.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

There are holes in this struct but this patch is already big. Struct
packing can be done separately. Even with holes, we save 8 bytes per
object_entry.

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 124 +++++++++++++++++++++++------------------
 pack-objects.h         |  14 +++--
 2 files changed, 78 insertions(+), 60 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5818bf73ca..55f19a1f18 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,20 @@
 #include "list.h"
 #include "packfile.h"
 
+#define DELTA(obj) \
+	((obj)->delta_idx ? &to_pack.objects[(obj)->delta_idx - 1] : NULL)
+#define DELTA_CHILD(obj) \
+	((obj)->delta_child_idx ? &to_pack.objects[(obj)->delta_child_idx - 1] : NULL)
+#define DELTA_SIBLING(obj) \
+	((obj)->delta_sibling_idx ? &to_pack.objects[(obj)->delta_sibling_idx - 1] : NULL)
+
+#define CLEAR_DELTA(obj) (obj)->delta_idx = 0
+#define CLEAR_DELTA_CHILD(obj) (obj)->delta_child_idx = 0
+#define CLEAR_DELTA_SIBLING(obj) (obj)->delta_sibling_idx = 0
+
+#define SET_DELTA(obj, val) (obj)->delta_idx = ((val) - to_pack.objects) + 1
+#define SET_DELTA_CHILD(obj, val) (obj)->delta_child_idx = ((val) - to_pack.objects) + 1
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -125,11 +139,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -286,12 +300,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -315,7 +329,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -341,7 +355,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -377,8 +391,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -406,7 +420,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -425,7 +439,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -465,13 +479,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -486,7 +500,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (entry->type != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -538,12 +552,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			CLEAR_DELTA(e);
 			break;
 		default:
 			break;
@@ -605,34 +619,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -643,7 +657,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -658,8 +672,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		CLEAR_DELTA_CHILD(&objects[i]);
+		CLEAR_DELTA_SIBLING(&objects[i]);
 	}
 
 	/*
@@ -669,11 +683,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1509,10 +1523,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			entry->type = entry->in_pack_type;
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1583,17 +1597,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	CLEAR_DELTA(entry);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1635,7 +1651,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1660,7 +1676,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1683,7 +1699,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1698,7 +1714,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1881,7 +1897,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1957,7 +1973,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1986,7 +2002,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1995,13 +2011,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2070,7 +2086,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2120,7 +2136,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2128,7 +2144,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2449,7 +2465,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index 83d91a0765..db50e56223 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -33,11 +33,13 @@ struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
+	/* XXX 4 bytes hole, try to pack */
+
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -60,7 +62,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 112, padding: 4, bit_padding: 18 bits */
+	/* size: 104, padding: 4, bit_padding: 18 bits */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v2 9/9] pack-objects: reorder 'hash' to pack struct object_entry
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (7 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 8/9] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-03  2:47     ` Nguyễn Thái Ngọc Duy
  2018-03-05  9:28     ` [PATCH/RFC v2 0/9] Reduce pack-objects memory footprint Duy Nguyen
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
  10 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-03  2:47 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason,
	Nguyễn Thái Ngọc Duy

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index db50e56223..a57aca5f03 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -38,12 +38,10 @@ struct object_entry {
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	/* XXX 4 bytes hole, try to pack */
-
+	uint32_t hash;			/* name hint hash */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	uint32_t hash;			/* name hint hash */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
@@ -62,7 +60,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 104, padding: 4, bit_padding: 18 bits */
+	/* size: 96, bit_padding: 18 bits */
 };
 
 struct packing_data {
-- 
2.16.1.435.g8f24da2e1a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v2 0/9] Reduce pack-objects memory footprint
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (8 preceding siblings ...)
  2018-03-03  2:47     ` [PATCH/RFC v2 9/9] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-05  9:28     ` Duy Nguyen
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
  10 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-05  9:28 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Jeff King, Eric Wong,
	Ævar Arnfjörð Bjarmason

On Sat, Mar 03, 2018 at 09:46:57AM +0700, Nguyễn Thái Ngọc Duy wrote:
> The array of object_entry in pack-objects can take a lot of memory
> when pack-objects is run in "pack everything" mode. On linux-2.6.git,
> this array alone takes roughly 800MB.
> 
> This series reorders some fields and reduces field size... to keep
> this struct smaller. Its size goes from 136 bytes to 96 bytes (29%) on
> 64-bit linux and saves 260MB on linux-2.6.git.

And I continue to push this until someone screams "enough is enough!".
This patch saves 4 more bytes. The trade off is, processing objects
with offset beyond 4 GB will be slower. But I think this is a reasonable
trade off.

The same trick could be done for "size" field in this struct
(i.e. uncompressed object size greater than 32 bits must be read back
from disk). Interestingly though, "size" is unsigned long which is 32
bits on Windows and nobody has complained about it so far, we could
even just unconditionally reject objects larger than 4GB.

-- 8< --
Subject: [PATCH] pack-objects: shrink in_pack_offset for 4GB pack files

If a pack file is smaller than 4GB, pack offsets should fit within 32
bits. If not (which is not considered a common case), this field
in_pack_location stores the object index instead (which still fits in 32
bits) and getting pack offset requires extra lookups through
nth_packed_object_offset() function call.

This saves us 4 bytes, but we lose them to padding until this struct is
shrunk further.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 26 ++++++++++++++-----------
 pack-objects.h         | 44 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 55f19a1f18..57c04b277b 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -43,6 +43,8 @@
 #define SET_DELTA(obj, val) (obj)->delta_idx = ((val) - to_pack.objects) + 1
 #define SET_DELTA_CHILD(obj, val) (obj)->delta_child_idx = ((val) - to_pack.objects) + 1
 
+#define IN_PACK_OFFSET(obj) oe_in_pack_offset(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -397,7 +399,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
 
-	offset = entry->in_pack_offset;
+	offset = IN_PACK_OFFSET(entry);
 	revidx = find_pack_revindex(p, offset);
 	datalen = revidx[1].offset - offset;
 	if (!pack_to_stdout && p->index_version > 1 &&
@@ -1107,7 +1109,7 @@ static void create_object_entry(const struct object_id *oid,
 		if (found_pack->index <= 0)
 			die("BUG: found_pack should be NULL instead of having non-positive index");
 		entry->in_pack_idx = found_pack->index;
-		entry->in_pack_offset = found_offset;
+		oe_set_in_pack_offset(&to_pack, entry, found_offset);
 	}
 
 	entry->no_try_delta = no_try_delta;
@@ -1442,7 +1444,7 @@ static void check_object(struct object_entry *entry)
 		unsigned char *buf, c;
 		enum object_type type;
 
-		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
+		buf = use_pack(p, &w_curs, IN_PACK_OFFSET(entry), &avail);
 
 		/*
 		 * We want in_pack_type even if we do not reuse delta
@@ -1475,12 +1477,12 @@ static void check_object(struct object_entry *entry)
 		case OBJ_REF_DELTA:
 			if (reuse_delta && !entry->preferred_base)
 				base_ref = use_pack(p, &w_curs,
-						entry->in_pack_offset + used, NULL);
+						IN_PACK_OFFSET(entry) + used, NULL);
 			entry->in_pack_header_size = used + 20;
 			break;
 		case OBJ_OFS_DELTA:
 			buf = use_pack(p, &w_curs,
-				       entry->in_pack_offset + used, NULL);
+				       IN_PACK_OFFSET(entry) + used, NULL);
 			used_0 = 0;
 			c = buf[used_0++];
 			ofs = c & 127;
@@ -1494,8 +1496,8 @@ static void check_object(struct object_entry *entry)
 				c = buf[used_0++];
 				ofs = (ofs << 7) + (c & 127);
 			}
-			ofs = entry->in_pack_offset - ofs;
-			if (ofs <= 0 || ofs >= entry->in_pack_offset) {
+			ofs = IN_PACK_OFFSET(entry) - ofs;
+			if (ofs <= 0 || ofs >= IN_PACK_OFFSET(entry)) {
 				error("delta base offset out of bound for %s",
 				      oid_to_hex(&entry->idx.oid));
 				goto give_up;
@@ -1538,7 +1540,7 @@ static void check_object(struct object_entry *entry)
 			 * object size from the delta header.
 			 */
 			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
+				IN_PACK_OFFSET(entry) + entry->in_pack_header_size);
 			if (entry->size == 0)
 				goto give_up;
 			unuse_pack(&w_curs);
@@ -1578,8 +1580,8 @@ static int pack_offset_sort(const void *_a, const void *_b)
 		return -1;
 	if (a_in_pack > b_in_pack)
 		return 1;
-	return a->in_pack_offset < b->in_pack_offset ? -1 :
-			(a->in_pack_offset > b->in_pack_offset);
+	return IN_PACK_OFFSET(a) < IN_PACK_OFFSET(b) ? -1 :
+		(IN_PACK_OFFSET(a) > IN_PACK_OFFSET(b));
 }
 
 /*
@@ -1614,7 +1616,9 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(IN_PACK(&to_pack, entry), entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(&to_pack, entry),
+			       IN_PACK_OFFSET(entry),
+			       &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
diff --git a/pack-objects.h b/pack-objects.h
index a57aca5f03..cb752fb4d8 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,8 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#include "packfile.h"
+
 #define OE_DFS_STATE_BITS 2
 #define OE_DEPTH_BITS 12
 #define OE_IN_PACK_BITS 14
@@ -32,7 +34,6 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
@@ -42,6 +43,13 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
+	/*
+	 * If in_pack_location_is_offset is true, this contains offset
+	 * to the object in "in_pack". If false, it contains the
+	 * object _index_ and pack offset must be retrieved via
+	 * nth_packed_object_offset().
+	 */
+	uint32_t in_pack_location;
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
@@ -54,13 +62,14 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned in_pack_location_is_offset:1;
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 
 	/* XXX 8 bits hole, try to pack */
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 96, bit_padding: 18 bits */
+	/* size: 96, padding: 4, bit_padding: 17 bits */
 };
 
 struct packing_data {
@@ -103,4 +112,35 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline off_t oe_in_pack_offset(const struct packing_data *pack,
+				      const struct object_entry *e)
+{
+	if (e->in_pack_location_is_offset)
+		return e->in_pack_location;
+
+	/*
+	 * Slow path where in_pack_location contains the object index
+	 * instead of offset. We perform one more lookup.
+	 */
+	return nth_packed_object_offset(IN_PACK(pack, e),
+					e->in_pack_location);
+}
+
+static inline void oe_set_in_pack_offset(struct packing_data *pack,
+					 struct object_entry *e,
+					 off_t offset)
+{
+	struct revindex_entry *revidx;
+
+	e->in_pack_location = offset;
+	if (e->in_pack_location == offset) {
+		e->in_pack_location_is_offset = 1;
+		return;
+	}
+
+	revidx = find_pack_revindex(IN_PACK(pack, e), offset);
+	e->in_pack_location = revidx->nr;
+	e->in_pack_location_is_offset = 0;
+}
+
 #endif
-- 
2.16.1.435.g8f24da2e1a

-- 8< --

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config
  2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
  2018-03-01 18:14     ` Junio C Hamano
@ 2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-05 14:00 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Jeff King, Eric Wong


On Thu, Mar 01 2018, Nguyễn Thái Ngọc Duy jotted:

> pack-objects could be a big memory hog especially on large repos,
> everybody knows that. The suggestion to stick a .keep file on the
> largest pack to avoid this problem is also known for a long time.
>
> Let's do the suggestion automatically instead of waiting for people to
> come to Git mailing list and get the advice. When a certain condition
> is met, gc --auto create a .keep file temporary before repack is run,
> then remove it afterward.
>
> gc --auto does this based on an estimation of pack-objects memory
> usage and whether that fits in one third of system memory (the
> assumption here is for desktop environment where there are many other
> applications running).
>
> Since the estimation may be inaccurate and that 1/3 threshold is
> arbitrary, give the user a finer control over this mechanism as well:
> if the largest pack is larger than gc.bigPackThreshold, it's kept.

This is very promising. Saves lots of memory on my ad-hoc testing of
adding a *.keep file on an in-house repo.

> +	if (big_pack_threshold)
> +		return pack->pack_size >= big_pack_threshold;
> +
> +	/* First we have to scan through at least one pack */
> +	mem_want = pack->pack_size + pack->index_size;
> +	/* then pack-objects needs lots more for book keeping */
> +	mem_want += sizeof(struct object_entry) * nr_objects;
> +	/*
> +	 * internal rev-list --all --objects takes up some memory too,
> +	 * let's say half of it is for blobs
> +	 */
> +	mem_want += sizeof(struct blob) * nr_objects / 2;
> +	/*
> +	 * and the other half is for trees (commits and tags are
> +	 * usually insignificant)
> +	 */
> +	mem_want += sizeof(struct tree) * nr_objects / 2;
> +	/* and then obj_hash[], underestimated in fact */
> +	mem_want += sizeof(struct object *) * nr_objects;
> +	/*
> +	 * read_sha1_file() (either at delta calculation phase, or
> +	 * writing phase) also fills up the delta base cache
> +	 */
> +	mem_want += delta_base_cache_limit;
> +	/* and of course pack-objects has its own delta cache */
> +	mem_want += max_delta_cache_size;

I'm not familiar enough with this part to say, but isn't this assuming a
lot about the distribution of objects in a way that will cause us not to
repack in some pathological cases?

Probably worth documenting...

> +	/* Only allow 1/3 of memory for pack-objects */
> +	mem_have = total_ram() / 3;

Would be great to have this be a configurable variable, so you could set
it to e.g. 33% (like here), 50% etc.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto
  2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
  2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
@ 2018-03-06 10:41   ` Nguyễn Thái Ngọc Duy
  2018-03-06 10:41     ` [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents Nguyễn Thái Ngọc Duy
                       ` (6 more replies)
  1 sibling, 7 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-06 10:41 UTC (permalink / raw)
  To: pclouds
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

I'm pretty happy with this now. Like v1, this is about not touching
the giant base pack when doing background gc. This saves about 2/3 of
memory, which in turn should improve performance if you're under
memory pressure.

v2 changes:

- the core idea remains the same, mem_have is increased to half total
  memory though. I figure including the whole mmap'd pack in the
  memory estimation may be a bit much, which is why I make this
  change.
- no creating .keep files temporarily
- the config key is renamed gc.bigBasePackThreshold (named after
  core.bigFileThreshold)
- note that if you set gc.bigBasePackThreshold, then normal gc (without
  --auto) can trigger this mode too.
- you can also control this with --[no-]keep-base-pack
- documents and tests
- some more progress output improvements

I'm _not_ doing external rev-list in this series though. I found out
that we have added more and more stuff in the internal rev-list code
path over the years, and simply running

    git rev-list .... | git pack-objects

will break stuff (the test suite first for example). I will do it
because it does help. But it will take some time.

PS. This conflicts with sb/packfiles-in-repository on 'pu' because I
introduced new references to the global variable "packed_git" and
prepare_packed_git(). Resolving this should be simple though:

- drop prepare_packed_git()
- replace packed_git with get_packed_git(the_repository)

Nguyễn Thái Ngọc Duy (5):
  fixup! Add a test showing that 'git repack' throws away grafted-away
    parents
  repack: add --keep-pack option
  gc --auto: exclude base pack if not enough mem to "repack -ad"
  pack-objects: show some progress when counting kept objects
  pack-objects: display progress in get_object_details()

 Documentation/config.txt           |   7 ++
 Documentation/git-gc.txt           |  13 +++
 Documentation/git-pack-objects.txt |   4 +
 Documentation/git-repack.txt       |   4 +
 builtin/gc.c                       | 153 +++++++++++++++++++++++++++--
 builtin/pack-objects.c             |  51 ++++++++--
 builtin/repack.c                   |  23 ++++-
 config.mak.uname                   |   1 +
 git-compat-util.h                  |   4 +
 pack-objects.h                     |   2 +
 t/t6500-gc.sh                      |  29 ++++++
 t/t7700-repack.sh                  |  21 +++-
 12 files changed, 295 insertions(+), 17 deletions(-)

-- 
2.16.2.784.gb291bd247e


^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
@ 2018-03-06 10:41     ` Nguyễn Thái Ngọc Duy
  2018-03-06 18:01       ` Junio C Hamano
  2018-03-06 10:41     ` [PATCH v2 2/5] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
                       ` (5 subsequent siblings)
  6 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-06 10:41 UTC (permalink / raw)
  To: pclouds
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

The closing quote of a test body by convention is always at the start
of line.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 t/t7700-repack.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 6061a04147..38247afbec 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -194,7 +194,7 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
 	git reflog expire --expire=$test_tick --expire-unreachable=$test_tick --all &&
 	git repack -a -d &&
 	git cat-file -t $H1
-	'
+'
 
 test_done
 
-- 
2.16.2.784.gb291bd247e


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v2 2/5] repack: add --keep-pack option
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
  2018-03-06 10:41     ` [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents Nguyễn Thái Ngọc Duy
@ 2018-03-06 10:41     ` Nguyễn Thái Ngọc Duy
  2018-03-06 18:25       ` Junio C Hamano
  2018-03-06 10:41     ` [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
                       ` (4 subsequent siblings)
  6 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-06 10:41 UTC (permalink / raw)
  To: pclouds
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

We allow keeping existing packs by having companion .keep files. This
is helpful when a pack is permanently kept. In the next patch, git-gc
just wants to keep a pack temporarily, for one pack-objects
run. git-gc can use --keep-pack for this use case.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  4 ++++
 Documentation/git-repack.txt       |  4 ++++
 builtin/pack-objects.c             | 31 ++++++++++++++++++++++++++++++
 builtin/repack.c                   | 23 +++++++++++++++++++---
 t/t7700-repack.sh                  | 19 ++++++++++++++++++
 5 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..1975477160 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -126,6 +126,10 @@ base-name::
 	has a .keep file to be ignored, even if it would have
 	otherwise been packed.
 
+--keep-pack=<pack name>::
+	Ignore the given pack. This is the equivalent of having
+	a `.keep` file on the pack. Implies `--honor-pack-keep`.
+
 --incremental::
 	This flag causes an object already in a pack to be ignored
 	even if it would have otherwise been packed.
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..12b073e115 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -133,6 +133,10 @@ other objects in that pack they already have locally.
 	with `-b` or `repack.writeBitmaps`, as it ensures that the
 	bitmapped packfile has the necessary objects.
 
+--keep-pack=<pack name>::
+	Exclude the given pack from repacking. This is the equivalent
+	of having a `.keep` file on the pack. Implies `--pack-kept-objects`.
+
 --unpack-unreachable=<when>::
 	When loosening unreachable objects, do not bother loosening any
 	objects older than `<when>`. This can be used to optimize out
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..8e3f870d71 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -28,6 +28,7 @@
 #include "argv-array.h"
 #include "list.h"
 #include "packfile.h"
+#include "dir.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -2917,6 +2918,32 @@ static void get_object_list(int ac, const char **av)
 	oid_array_clear(&recent_objects);
 }
 
+static void add_extra_kept_packs(const struct string_list *names)
+{
+	struct packed_git *p;
+
+	if (!names->nr)
+		return;
+
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next) {
+		const char *name = basename(p->pack_name);
+		int i;
+
+		if (!p->pack_local)
+			continue;
+
+		for (i = 0; i < names->nr; i++) {
+			if (fspathcmp(name, names->items[i].string))
+				continue;
+
+			p->pack_keep = 1;
+			ignore_packed_keep = 1;
+			break;
+		}
+	}
+}
+
 static int option_parse_index_version(const struct option *opt,
 				      const char *arg, int unset)
 {
@@ -2956,6 +2983,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	struct argv_array rp = ARGV_ARRAY_INIT;
 	int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0;
 	int rev_list_index = 0;
+	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	struct option pack_objects_options[] = {
 		OPT_SET_INT('q', "quiet", &progress,
 			    N_("do not show progress meter"), 0),
@@ -3022,6 +3050,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("create packs suitable for shallow fetches")),
 		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep,
 			 N_("ignore packs that have companion .keep file")),
+		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
+				N_("ignore this pack")),
 		OPT_INTEGER(0, "compression", &pack_compression_level,
 			    N_("pack compression level")),
 		OPT_SET_INT(0, "keep-true-parents", &grafts_replace_parents,
@@ -3150,6 +3180,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		progress = 2;
 
 	prepare_packed_git();
+	add_extra_kept_packs(&keep_pack_list);
 	if (ignore_packed_keep) {
 		struct packed_git *p;
 		for (p = packed_git; p; p = p->next)
diff --git a/builtin/repack.c b/builtin/repack.c
index 7bdb40142f..6a1dade0e1 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -86,7 +86,8 @@ static void remove_pack_on_signal(int signo)
  * have a corresponding .keep or .promisor file. These packs are not to
  * be kept if we are going to pack everything into one file.
  */
-static void get_non_kept_pack_filenames(struct string_list *fname_list)
+static void get_non_kept_pack_filenames(struct string_list *fname_list,
+					const struct string_list *extra_keep)
 {
 	DIR *dir;
 	struct dirent *e;
@@ -97,6 +98,14 @@ static void get_non_kept_pack_filenames(struct string_list *fname_list)
 
 	while ((e = readdir(dir)) != NULL) {
 		size_t len;
+		int i;
+
+		for (i = 0;i < extra_keep->nr; i++)
+			if (!fspathcmp(e->d_name, extra_keep->items[i].string))
+				break;
+		if (extra_keep->nr > 0 && i < extra_keep->nr)
+			continue;
+
 		if (!strip_suffix(e->d_name, ".pack", &len))
 			continue;
 
@@ -148,7 +157,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	struct string_list rollback = STRING_LIST_INIT_NODUP;
 	struct string_list existing_packs = STRING_LIST_INIT_DUP;
 	struct strbuf line = STRBUF_INIT;
-	int ext, ret, failed;
+	int i, ext, ret, failed;
 	FILE *out;
 
 	/* variables to be filled by option parsing */
@@ -160,6 +169,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	const char *depth = NULL;
 	const char *threads = NULL;
 	const char *max_pack_size = NULL;
+	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	int no_reuse_delta = 0, no_reuse_object = 0;
 	int no_update_server_info = 0;
 	int quiet = 0;
@@ -200,6 +210,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 				N_("maximum size of each packfile")),
 		OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects,
 				N_("repack objects in packs marked with .keep")),
+		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
+				N_("do not repack this pack")),
 		OPT_END()
 	};
 
@@ -215,6 +227,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	    (unpack_unreachable || (pack_everything & LOOSEN_UNREACHABLE)))
 		die(_("--keep-unreachable and -A are incompatible"));
 
+	if (keep_pack_list.nr && pack_kept_objects > 0)
+		die(_("incompatible --keep-pack and --pack-kept-objects"));
 	if (pack_kept_objects < 0)
 		pack_kept_objects = write_bitmaps;
 
@@ -230,6 +244,9 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	argv_array_push(&cmd.args, "--keep-true-parents");
 	if (!pack_kept_objects)
 		argv_array_push(&cmd.args, "--honor-pack-keep");
+	for (i = 0; i < keep_pack_list.nr; i++)
+		argv_array_pushf(&cmd.args, "--keep-pack=%s",
+				 keep_pack_list.items[i].string);
 	argv_array_push(&cmd.args, "--non-empty");
 	argv_array_push(&cmd.args, "--all");
 	argv_array_push(&cmd.args, "--reflog");
@@ -254,7 +271,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 		argv_array_push(&cmd.args, "--write-bitmap-index");
 
 	if (pack_everything & ALL_INTO_ONE) {
-		get_non_kept_pack_filenames(&existing_packs);
+		get_non_kept_pack_filenames(&existing_packs, &keep_pack_list);
 
 		if (existing_packs.nr && delete_redundant) {
 			if (unpack_unreachable) {
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 38247afbec..553d907d34 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -196,5 +196,24 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
 	git cat-file -t $H1
 '
 
+test_expect_success 'repack --keep-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		for cmit in one two three four; do
+			test_commit $cmit &&
+			git repack -d
+		done &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 4 pack-list &&
+		KEEP1=`head -n1 pack-list` &&
+		KEEP4=`tail -n1 pack-list` &&
+		git repack -a -d --keep-pack $KEEP1 --keep-pack $KEEP4 &&
+		ls .git/objects/pack/*.pack >new-counts &&
+		test_line_count = 3 new-counts &&
+		git fsck
+	)
+'
+
 test_done
 
-- 
2.16.2.784.gb291bd247e


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
  2018-03-06 10:41     ` [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents Nguyễn Thái Ngọc Duy
  2018-03-06 10:41     ` [PATCH v2 2/5] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
@ 2018-03-06 10:41     ` Nguyễn Thái Ngọc Duy
  2018-03-06 19:19       ` Junio C Hamano
                         ` (2 more replies)
  2018-03-06 10:41     ` [PATCH v2 4/5] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
                       ` (3 subsequent siblings)
  6 siblings, 3 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-06 10:41 UTC (permalink / raw)
  To: pclouds
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

pack-objects could be a big memory hog especially on large repos,
everybody knows that. The suggestion to stick a .keep file on the
giant base pack to avoid this problem has also been known for a long time.

Let's do the suggestion automatically instead of waiting for people to
come to Git mailing list and get the advice. When a certain condition
is met, "gc --auto" tells "git repack" to keep the base pack around.
The end result would be two packs instead of one.

On linux-2.6.git, valgrind massif reports 1.6GB heap in "pack all"
case, and 535MB [1] in "pack all except the base pack" case. We save
roughly 1GB memory by excluding the base pack.

gc --auto decides to do this based on an estimation of pack-objects
memory usage, which is quite accurate at least for the heap part, and
whether that fits in half of system memory (the assumption here is for
desktop environment where there are many other applications running).

Since the estimation may be inaccurate and that 1/2 threshold is
really arbitrary, give the user a finer control over this mechanism:
if the largest pack is larger than gc.bigBasePackThreshold, it's kept.

PS. A big chunk of the remaining 535MB is the result of pack-objects
running rev-list internally. This will be dealt with when we could run
rev-list externally. Right now we can't because pack-objects internal
rev-list does more regarding unreachable objects, which cannot be done
by "git rev-list".

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |   7 ++
 Documentation/git-gc.txt |  13 ++++
 builtin/gc.c             | 153 +++++++++++++++++++++++++++++++++++++--
 builtin/pack-objects.c   |   2 +-
 config.mak.uname         |   1 +
 git-compat-util.h        |   4 +
 pack-objects.h           |   2 +
 t/t6500-gc.sh            |  29 ++++++++
 8 files changed, 204 insertions(+), 7 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..120cf6bac9 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1549,6 +1549,13 @@ gc.autoDetach::
 	Make `git gc --auto` return immediately and run in background
 	if the system supports it. Default is true.
 
+gc.bigBasePackThreshold::
+	Make `git gc --auto` only enable `--keep-base-pack` when the
+	base pack's size is larger than this limit (in bytes).
+	Defaults to zero, which disables this check and lets
+	`git gc --auto` determine when to enable `--keep-base-pack`
+	based on memory usage.
+
 gc.logExpiry::
 	If the file gc.log exists, then `git gc --auto` won't run
 	unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 571b5a7e3c..35ad420d5c 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -59,6 +59,11 @@ then existing packs (except those marked with a `.keep` file)
 are consolidated into a single pack by using the `-A` option of
 'git repack'. Setting `gc.autoPackLimit` to 0 disables
 automatic consolidation of packs.
++
+If the physical amount of memory is considered not enough for `git
+repack` to run smoothly, `--keep-base-pack` is enabled. This could be
+overridden by setting `gc.bigBasePackThreshold` which only enables
+`--keep-base-pack` when the base pack is larger than the specified limit.
 
 --prune=<date>::
 	Prune loose objects older than date (default is 2 weeks ago,
@@ -78,6 +83,10 @@ automatic consolidation of packs.
 	Force `git gc` to run even if there may be another `git gc`
 	instance running on this repository.
 
+--keep-base-pack::
+	All packs except the base pack are consolidated into a single
+	pack. The largest pack is considered the base pack.
+
 Configuration
 -------------
 
@@ -167,6 +176,10 @@ run commands concurrently have to live with some risk of corruption (which
 seems to be low in practice) unless they turn off automatic garbage
 collection with 'git config gc.auto 0'.
 
+Set environment variable `GIT_TRACE` in order to see the memory usage
+estimation in `git gc --auto` that determines whether the base pack is
+kept.
+
 HOOKS
 -----
 
diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..273657ddf4 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,10 @@
 #include "argv-array.h"
 #include "commit.h"
 #include "packfile.h"
+#include "pack.h"
+#include "pack-objects.h"
+#include "blob.h"
+#include "tree.h"
 
 #define FAILED_RUN "failed to run %s"
 
@@ -39,6 +43,8 @@ static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
+static unsigned long big_base_pack_threshold;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -126,6 +132,9 @@ static void gc_config(void)
 	git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire);
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
+	git_config_get_ulong("gc.bigbasepackthreshold", &big_base_pack_threshold);
+	git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size);
+
 	git_config(git_default_config, NULL);
 }
 
@@ -164,6 +173,21 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
+static struct packed_git *find_the_base_pack(void)
+{
+	struct packed_git *p, *base = NULL;
+
+	prepare_packed_git();
+
+	for (p = packed_git; p; p = p->next) {
+		if (p->pack_local &&
+		    (!base || base->pack_size < p->pack_size))
+			base = p;
+	}
+
+	return base;
+}
+
 static int too_many_packs(void)
 {
 	struct packed_git *p;
@@ -187,7 +211,101 @@ static int too_many_packs(void)
 	return gc_auto_pack_limit < cnt;
 }
 
-static void add_repack_all_option(void)
+static inline unsigned long total_ram(void)
+{
+	unsigned long default_ram = 4;
+#ifdef HAVE_SYSINFO
+	struct sysinfo si;
+
+	if (!sysinfo(&si))
+		return si.totalram;
+#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
+	int64_t physical_memory;
+	int mib[2];
+	size_t length;
+
+	mib[0] = CTL_HW;
+	mib[1] = HW_MEMSIZE;
+	length = sizeof(int64_t);
+	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
+		return physical_memory;
+#elif defined(GIT_WINDOWS_NATIVE)
+	MEMORYSTATUSEX memInfo;
+
+	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+	if (GlobalMemoryStatusEx(&memInfo))
+		return memInfo.ullTotalPhys;
+#else
+	fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
+		default_ram);
+#endif
+	return default_ram * 1024 * 1024 * 1024;
+}
+
+static int pack_objects_uses_too_much_memory(struct packed_git *pack)
+{
+	unsigned long nr_objects = approximate_object_count();
+	size_t mem_want, mem_have, os_cache, heap;
+
+	if (!pack || !nr_objects)
+		return 0;
+
+	if (big_base_pack_threshold)
+		return pack->pack_size >= big_base_pack_threshold;
+
+	/*
+	 * First we have to scan through at least one pack.
+	 * Assume enough room in OS file cache to keep the entire pack
+	 * or we may accidentally evict data of other processes from
+	 * the cache.
+	 */
+	os_cache = pack->pack_size + pack->index_size;
+	/* then pack-objects needs lots more for book keeping */
+	heap = sizeof(struct object_entry) * nr_objects;
+	/*
+	 * internal rev-list --all --objects takes up some memory too,
+	 * let's say half of it is for blobs
+	 */
+	heap += sizeof(struct blob) * nr_objects / 2;
+	/*
+	 * and the other half is for trees (commits and tags are
+	 * usually insignificant)
+	 */
+	heap += sizeof(struct tree) * nr_objects / 2;
+	/* and then obj_hash[], underestimated in fact */
+	heap += sizeof(struct object *) * nr_objects;
+	/* revindex is used also */
+	heap += sizeof(struct revindex_entry) * nr_objects;
+	/*
+	 * read_sha1_file() (either at delta calculation phase, or
+	 * writing phase) also fills up the delta base cache
+	 */
+	heap += delta_base_cache_limit;
+	/* and of course pack-objects has its own delta cache */
+	heap += max_delta_cache_size;
+
+	/*
+	 * Only allow 1/2 of memory for pack-objects, leave the rest
+	 * for the OS and other processes in the system.
+	 */
+	mem_have = total_ram() / 2;
+	mem_want = os_cache + heap;
+
+	trace_printf("gc mem estimation\n"
+		     "mem_have: %" PRIuMAX ", mem_want: %" PRIuMAX ", "
+		     "heap: %" PRIuMAX "\n"
+		     "pack_size: %" PRIuMAX ", index_size: %" PRIuMAX ", "
+		     "nr_objects: %" PRIuMAX "\n"
+		     "base_cache: %" PRIuMAX ", delta_cache: %" PRIuMAX "\n",
+		     (uintmax_t)mem_have, (uintmax_t)mem_want, (uintmax_t)heap,
+		     (uintmax_t)pack->pack_size, (uintmax_t)pack->index_size,
+		     (uintmax_t)nr_objects,
+		     (uintmax_t)delta_base_cache_limit, (uintmax_t)max_delta_cache_size);
+
+	return mem_want >= mem_have;
+}
+
+static void add_repack_all_option(struct packed_git *keep_pack)
 {
 	if (prune_expire && !strcmp(prune_expire, "now"))
 		argv_array_push(&repack, "-a");
@@ -196,6 +314,10 @@ static void add_repack_all_option(void)
 		if (prune_expire)
 			argv_array_pushf(&repack, "--unpack-unreachable=%s", prune_expire);
 	}
+
+	if (keep_pack)
+		argv_array_pushf(&repack, "--keep-pack=%s",
+				 basename(keep_pack->pack_name));
 }
 
 static void add_repack_incremental_option(void)
@@ -218,9 +340,14 @@ static int need_to_gc(void)
 	 * we run "repack -A -d -l".  Otherwise we tell the caller
 	 * there is no need.
 	 */
-	if (too_many_packs())
-		add_repack_all_option();
-	else if (too_many_loose_objects())
+	if (too_many_packs()) {
+		struct packed_git *exclude = find_the_base_pack();
+
+		if (!pack_objects_uses_too_much_memory(exclude))
+			exclude = NULL;
+
+		add_repack_all_option(exclude);
+	} else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
 		return 0;
@@ -353,6 +480,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 	const char *name;
 	pid_t pid;
 	int daemonized = 0;
+	int keep_base_pack = -1;
 
 	struct option builtin_gc_options[] = {
 		OPT__QUIET(&quiet, N_("suppress progress reporting")),
@@ -362,6 +490,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 		OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
 		OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
 		OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
+		OPT_BOOL(0, "keep-base-pack", &keep_base_pack,
+			 N_("repack all other packs except the base pack")),
 		OPT_END()
 	};
 
@@ -427,8 +557,19 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 			 */
 			daemonized = !daemonize();
 		}
-	} else
-		add_repack_all_option();
+	} else {
+		struct packed_git *base_pack = find_the_base_pack();
+		struct packed_git *exclude = NULL;
+
+		if (keep_base_pack != -1) {
+			if (keep_base_pack)
+				exclude = base_pack;
+		} else if (base_pack && big_base_pack_threshold &&
+			   base_pack->pack_size >= big_base_pack_threshold)
+			exclude = base_pack;
+
+		add_repack_all_option(exclude);
+	}
 
 	name = lock_repo_for_gc(force, &pid);
 	if (name) {
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 8e3f870d71..fcdd398eb7 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -79,7 +79,7 @@ static uint16_t write_bitmap_options;
 static int exclude_promisor_objects;
 
 static unsigned long delta_cache_size = 0;
-static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
diff --git a/config.mak.uname b/config.mak.uname
index 6a1d0de0cc..ae9cbccec1 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -37,6 +37,7 @@ ifeq ($(uname_S),Linux)
 	HAVE_GETDELIM = YesPlease
 	SANE_TEXT_GREP=-a
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	BASIC_CFLAGS += -DHAVE_SYSINFO
 endif
 ifeq ($(uname_S),GNU/kFreeBSD)
 	HAVE_ALLOCA_H = YesPlease
diff --git a/git-compat-util.h b/git-compat-util.h
index 68b2ad531e..a84b21986d 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -284,6 +284,10 @@ extern char *gitdirname(char *);
 #include <openssl/err.h>
 #endif
 
+#ifdef HAVE_SYSINFO
+# include <sys/sysinfo.h>
+#endif
+
 /* On most systems <netdb.h> would have given us this, but
  * not on some systems (e.g. z/OS).
  */
diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..af4f46c026 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,8 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 41b0be575d..863fdbb0fd 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -5,6 +5,13 @@ test_description='basic git gc tests
 
 . ./test-lib.sh
 
+test_expect_success 'setup' '
+	# do not let the amount of physical memory affect gc
+	# behavior, make sure the pack_objects_uses_too_much_memory()
+	# always returns false
+	git config gc.bigBasePackThreshold 2g
+'
+
 test_expect_success 'gc empty repository' '
 	git gc
 '
@@ -116,6 +123,28 @@ test_expect_success 'background auto gc respects lock for all operations' '
 	test_path_is_file .git/refs/heads/should-be-loose
 '
 
+test_expect_success 'gc --keep-base-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		for i in 10; do
+			test_commit $i
+		done &&
+		git gc &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 1 pack-list &&
+		BASE_PACK=.git/objects/pack/pack-*.pack &&
+		for i in 10; do
+			test_commit more-$i
+		done &&
+		git gc --keep-base-pack &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 2 pack-list &&
+		test_path_is_file $BASE_PACK &&
+		git fsck
+	)
+'
+
 # DO NOT leave a detached auto gc process running near the end of the
 # test script: it can run long enough in the background to racily
 # interfere with the cleanup in 'test_done'.
-- 
2.16.2.784.gb291bd247e


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v2 4/5] pack-objects: show some progress when counting kept objects
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
                       ` (2 preceding siblings ...)
  2018-03-06 10:41     ` [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
@ 2018-03-06 10:41     ` Nguyễn Thái Ngọc Duy
  2018-03-12 18:32       ` Ævar Arnfjörð Bjarmason
  2018-03-06 10:41     ` [PATCH v2 5/5] pack-objects: display progress in get_object_details() Nguyễn Thái Ngọc Duy
                       ` (2 subsequent siblings)
  6 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-06 10:41 UTC (permalink / raw)
  To: pclouds
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

We only show progress when there are new objects to be packed. But
when --keep-pack is specified on the base pack, we will exclude most
of objects. This makes 'pack-objects' stay silent for a long time
while the counting phase is going.

Let's show some progress whenever we visit an object instead. The
number of packed objects will be shown after if it's not the same as
the number of visited objects.

Since the meaning of this number has changed, use another word instead
of "Counting" to hint about the change.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fcdd398eb7..24af4068a9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -44,7 +44,7 @@ static const char *pack_usage[] = {
 static struct packing_data to_pack;
 
 static struct pack_idx_entry **written_list;
-static uint32_t nr_result, nr_written;
+static uint32_t nr_result, nr_written, nr_seen;
 
 static int non_empty;
 static int reuse_delta = 1, reuse_object = 1;
@@ -1092,6 +1092,8 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	off_t found_offset = 0;
 	uint32_t index_pos;
 
+	display_progress(progress_state, nr_seen++);
+
 	if (have_duplicate_entry(oid, exclude, &index_pos))
 		return 0;
 
@@ -1107,8 +1109,6 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	create_object_entry(oid, type, pack_name_hash(name),
 			    exclude, name && no_try_delta(name),
 			    index_pos, found_pack, found_offset);
-
-	display_progress(progress_state, nr_result);
 	return 1;
 }
 
@@ -1119,6 +1119,8 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 {
 	uint32_t index_pos;
 
+	display_progress(progress_state, nr_seen++);
+
 	if (have_duplicate_entry(oid, 0, &index_pos))
 		return 0;
 
@@ -1126,8 +1128,6 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 		return 0;
 
 	create_object_entry(oid, type, name_hash, 0, 0, index_pos, pack, offset);
-
-	display_progress(progress_state, nr_result);
 	return 1;
 }
 
@@ -3205,7 +3205,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 
 	if (progress)
-		progress_state = start_progress(_("Counting objects"), 0);
+		progress_state = start_progress(_("Enumerating objects"), 0);
 	if (!use_internal_rev_list)
 		read_object_list_from_stdin();
 	else {
-- 
2.16.2.784.gb291bd247e


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v2 5/5] pack-objects: display progress in get_object_details()
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
                       ` (3 preceding siblings ...)
  2018-03-06 10:41     ` [PATCH v2 4/5] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
@ 2018-03-06 10:41     ` Nguyễn Thái Ngọc Duy
  2018-03-06 17:49     ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Junio C Hamano
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-06 10:41 UTC (permalink / raw)
  To: pclouds
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

This code is mostly about reading object headers, which is cheap. But
when the number of objects is very large (e.g. 6.5M on linux-2.6.git)
and the system is under memory pressure, this could take some time (86
seconds on my system).

Show something during this time to let the user know pack-objects is
still going strong.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 24af4068a9..2ec911bf10 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1713,6 +1713,10 @@ static void get_object_details(void)
 	uint32_t i;
 	struct object_entry **sorted_by_offset;
 
+	if (progress)
+		progress_state = start_progress(_("Getting object details"),
+						to_pack.nr_objects);
+
 	sorted_by_offset = xcalloc(to_pack.nr_objects, sizeof(struct object_entry *));
 	for (i = 0; i < to_pack.nr_objects; i++)
 		sorted_by_offset[i] = to_pack.objects + i;
@@ -1723,7 +1727,9 @@ static void get_object_details(void)
 		check_object(entry);
 		if (big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
+		display_progress(progress_state, i + 1);
 	}
+	stop_progress(&progress_state);
 
 	/*
 	 * This must happen in a second pass, since we rely on the delta
-- 
2.16.2.784.gb291bd247e


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
                       ` (4 preceding siblings ...)
  2018-03-06 10:41     ` [PATCH v2 5/5] pack-objects: display progress in get_object_details() Nguyễn Thái Ngọc Duy
@ 2018-03-06 17:49     ` Junio C Hamano
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  6 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-06 17:49 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> I'm pretty happy with this now. Like v1, this is about not touching
> the giant base pack when doing background gc. This saves about 2/3 of
> memory, which in turn should improve performance if you're under
> memory pressure.

Thanks.  I've quickly scanned them and the overall idea looks quite
sound.  I'll comment on individual changes if needed.

Will queue.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents
  2018-03-06 10:41     ` [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents Nguyễn Thái Ngọc Duy
@ 2018-03-06 18:01       ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-06 18:01 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> The closing quote of a test body by convention is always at the start
> of line.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---

Obviously correct.  While the title may technically be correct, the
original commit is so old that we won't be amending it anyway, and
it is not very helpful not to say which aspect of the original is
being fixed up.  Perhaps something like:

    t7700: have closing quote of a test at the beginning of line

    1ec64827 ("Add a test showing that 'git repack' throws away
    grafted-away parents", 2009-07-23) added this test but indented
    the closing quote by mistake.

if we really cared about documenting where the blame lies, but I do not
think it is quite worth it; anybody who cares deeply can ask "git
blame" about it, so I'll just retitle and use your original log
message body.

Thanks.

>  t/t7700-repack.sh | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
> index 6061a04147..38247afbec 100755
> --- a/t/t7700-repack.sh
> +++ b/t/t7700-repack.sh
> @@ -194,7 +194,7 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
>  	git reflog expire --expire=$test_tick --expire-unreachable=$test_tick --all &&
>  	git repack -a -d &&
>  	git cat-file -t $H1
> -	'
> +'
>  
>  test_done

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 2/5] repack: add --keep-pack option
  2018-03-06 10:41     ` [PATCH v2 2/5] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
@ 2018-03-06 18:25       ` Junio C Hamano
  2018-03-07 10:19         ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-06 18:25 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +--keep-pack=<pack name>::
> +	Ignore the given pack. This is the equivalent of having
> +	`.keep` file on the pack. Implies `--honor-pack-keep`.
> +

A few questions I am not sure how I would answer:

 - Do we want to have this listed in the SYNOPSIS section, too?

 - We would want to make the SP in "<pack name>" consistent with
   the dash in "<missing-action>" in the same document; which way do
   we make it uniform?

 - Is this description clear enough to convey that we allow more
   than one instance of this option specified, and the pack names
   accumulate?

 - Are there use cases where we want to _ignore_ on-disk ".keep" and
   only honor the ones given via the "--keep-pack" options?

 - Is this description clear enough to convey that <pack name> is
   just the filename part (i.e. "pack-[0-9a-f]{40}.pack") in our
   local $GIT_OBJECT_DIRECTORY/pack/ and not a full path to the
   packfile?  I think that design is sensible, simplifies the
   implementation and reduces mistakes.

> +static void add_extra_kept_packs(const struct string_list *names)
> +{
> +	struct packed_git *p;
> +
> +	if (!names->nr)
> +		return;
> +
> +	prepare_packed_git();
> +	for (p = packed_git; p; p = p->next) {
> +		const char *name = basename(p->pack_name);
> +		int i;
> +
> +		if (!p->pack_local)
> +			continue;
> +
> +		for (i = 0; i < names->nr; i++) {
> +			if (fspathcmp(name, names->items[i].string))
> +				continue;
> +
> +			p->pack_keep = 1;
> +			ignore_packed_keep = 1;
> +			break;
> +		}
> +	}
> +}

OK.

> diff --git a/builtin/repack.c b/builtin/repack.c
> index 7bdb40142f..6a1dade0e1 100644
> --- a/builtin/repack.c
> +++ b/builtin/repack.c
> @@ -86,7 +86,8 @@ static void remove_pack_on_signal(int signo)
>   * have a corresponding .keep or .promisor file. These packs are not to
>   * be kept if we are going to pack everything into one file.
>   */
> -static void get_non_kept_pack_filenames(struct string_list *fname_list)
> +static void get_non_kept_pack_filenames(struct string_list *fname_list,
> +					const struct string_list *extra_keep)
>  {
>  	DIR *dir;
>  	struct dirent *e;
> @@ -97,6 +98,14 @@ static void get_non_kept_pack_filenames(struct string_list *fname_list)
>  
>  	while ((e = readdir(dir)) != NULL) {
>  		size_t len;
> +		int i;
> +
> +		for (i = 0;i < extra_keep->nr; i++)

Style: SP after ';' before 'i'.

> +			if (!fspathcmp(e->d_name, extra_keep->items[i].string))
> +				break;
> +		if (extra_keep->nr > 0 && i < extra_keep->nr)
> +			continue;
> +
>  		if (!strip_suffix(e->d_name, ".pack", &len))
>  			continue;

> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
> index 38247afbec..553d907d34 100755
> --- a/t/t7700-repack.sh
> +++ b/t/t7700-repack.sh
> @@ -196,5 +196,24 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
>  	git cat-file -t $H1
>  '
>  
> +test_expect_success 'repack --keep-pack' '
> +	test_create_repo keep-pack &&
> +	(
> +		cd keep-pack &&
> +		for cmit in one two three four; do
> +			test_commit $cmit &&
> +			git repack -d
> +		done &&

Style: replace "; " before do with LF followed by a few HT.

This 'for' loop would not exit and report error if an early
test_commit or "git repack -d" fails, would it?

> +		( cd .git/objects/pack && ls *.pack ) >pack-list &&
> +		test_line_count = 4 pack-list &&
> +		KEEP1=`head -n1 pack-list` &&
> +		KEEP4=`tail -n1 pack-list` &&

Style: $()

> +		git repack -a -d --keep-pack $KEEP1 --keep-pack $KEEP4 &&
> +		ls .git/objects/pack/*.pack >new-counts &&
> +		test_line_count = 3 new-counts &&
> +		git fsck

One important invariant for this new feature is that $KEEP1 and
$KEEP4 will both appear in new-counts file, no?  Rename new-counts
to new-pack-list and inspect the contents, not just line count,
perhaps?

> +	)
> +'
> +
>  test_done

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-06 10:41     ` [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
@ 2018-03-06 19:19       ` Junio C Hamano
  2018-03-07 10:48         ` Duy Nguyen
  2018-03-07 10:48       ` Johannes Schindelin
  2018-03-12 19:30       ` Ævar Arnfjörð Bjarmason
  2 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-06 19:19 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> On linux-2.6.git, valgrind massif reports 1.6GB heap in "pack all"
> case, and 535MB [1] in "pack all except the base pack" case. We save
> roughly 1GB memory by excluding the base pack.

;-)

> gc --auto decides to do this based on an estimation of pack-objects
> memory usage, which is quite accurate at least for the heap part, and
> whether that fits in half of system memory (the assumption here is for
> desktop environment where there are many other applications running).

I was still confused by "decides to do this..." after reading the
above three times.  If this is describing the state _with_ this
patch applied, then "Teach 'gc --auto' to do this automatically..."
would make it clear that is what is going on.

> diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
> index 571b5a7e3c..35ad420d5c 100644
> --- a/Documentation/git-gc.txt
> +++ b/Documentation/git-gc.txt
> @@ -59,6 +59,11 @@ then existing packs (except those marked with a `.keep` file)
>  are consolidated into a single pack by using the `-A` option of
>  'git repack'. Setting `gc.autoPackLimit` to 0 disables
>  automatic consolidation of packs.
> ++
> +If the physical amount of memory is considered not enough for `git
> +repack` to run smoothly, `--keep-base-pack` is enabled. This could be
> +overridden by setting `gc.bigBasePackThreshold` which only enables
> +`--keep-base-pack` when the base pack is larger the specified limit.

I somehow find the flow of logic in these two sentences harder to
follow than necessary.  Perhaps swapping the order may make it
easier to grok?  That is:

 - When gc.bigBasePackThreshold is set, packs larger than that will
   automatically be kept (i.e. not considered for repacking);

 - When it is not set, we try to guess how memory constrained we are,
   and behave as if the threshold were set to the size of the
   largest packfile we have (i.e. that single pack is kept).

I think another and bigger reason why I found the original hard to
read is because it is described for those who already understand
what "--keep-base-pack" option does.  Rewriting it not to require
the pre-existing knowledge of that option would make it a lot easier
to grok, I would think (you may not realize it because you wrote the
feature and are very familiar with it, though).

> +--keep-base-pack::
> +	All packs except the base pack are consolidated into a single
> +	pack. The largest pack is considered the base pack.

This makes it sound as if packs with .keep are also repacked unless
they meet the threshold for "base pack".  Is that what you actually
implemented?

In order to do so, [2/5] needs to allow the "--keep-pack" option
override the on-disk .keep files.  In an earlier message, I wondered
if such an arrangement is useful in some use cases; I think it is,
and because those who do want to see the on-disk .keep files honored
can collect and include them in the set of packs to be kept via
"--keep-pack" (after all this is an option for low-level scripting
anyway).

> +Set environment variable `GIT_TRACE` in order to see the memory usage
> +estimation in `git gc --auto` that determines whether the base pack is
> +kept.

This is somewhat a puzzling use of trace.  It sounds more like a way
to find out "how" memory usage estimation is done and arriving at a
wrong value for those who want to debug the feature.

> +static unsigned long big_base_pack_threshold;
> +static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;

A new symbol, which is a good addition.

> +static struct packed_git *find_the_base_pack(void)
> +{
> +	struct packed_git *p, *base = NULL;
> +
> +	prepare_packed_git();
> +
> +	for (p = packed_git; p; p = p->next) {
> +		if (p->pack_local &&
> +		    (!base || base->pack_size < p->pack_size))
> +			base = p;
> +	}
> +
> +	return base;
> +}

This is finding the largest pack.

> @@ -187,7 +211,101 @@ static int too_many_packs(void)
>  	return gc_auto_pack_limit < cnt;
>  }
>  
> -static void add_repack_all_option(void)
> +static inline unsigned long total_ram(void)

"inline"?  We'd rather have compiler figure it out, no?

> +{
> +	unsigned long default_ram = 4;

4 what?  4 bytes?  Name it perhaps "default_ram_gb" or something?

> +#ifdef HAVE_SYSINFO
> +	struct sysinfo si;
> +
> +	if (!sysinfo(&si))
> +		return si.totalram;
> +#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
> +	int64_t physical_memory;
> +	int mib[2];
> +	size_t length;
> +
> +	mib[0] = CTL_HW;
> +	mib[1] = HW_MEMSIZE;
> +	length = sizeof(int64_t);
> +	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
> +		return physical_memory;
> +#elif defined(GIT_WINDOWS_NATIVE)
> +	MEMORYSTATUSEX memInfo;
> +
> +	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
> +	if (GlobalMemoryStatusEx(&memInfo))
> +		return memInfo;ullTotalPhys;

Is this legal C in Microsoft land?

> +#else
> +	fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
> +		default_ram);
> +#endif
> +	return default_ram * 1024 * 1024 * 1024;
> +}

I wonder if the above should go somewhere under compat/ without
ifdef but split into separate files for individual archs (I do not
know the answer to this question).

> +static void add_repack_all_option(struct packed_git *keep_pack)
>  {
>  	if (prune_expire && !strcmp(prune_expire, "now"))
>  		argv_array_push(&repack, "-a");
> @@ -196,6 +314,10 @@ static void add_repack_all_option(void)
>  		if (prune_expire)
>  			argv_array_pushf(&repack, "--unpack-unreachable=%s", prune_expire);
>  	}
> +
> +	if (keep_pack)
> +		argv_array_pushf(&repack, "--keep-pack=%s",
> +				 basename(keep_pack->pack_name));
>  }
>  
>  static void add_repack_incremental_option(void)
> @@ -218,9 +340,14 @@ static int need_to_gc(void)
>  	 * we run "repack -A -d -l".  Otherwise we tell the caller
>  	 * there is no need.
>  	 */
> -	if (too_many_packs())
> -		add_repack_all_option();
> -	else if (too_many_loose_objects())
> +	if (too_many_packs()) {
> +		struct packed_git *exclude = find_the_base_pack();
> +
> +		if (!pack_objects_uses_too_much_memory(exclude))
> +			exclude = NULL;
> +
> +		add_repack_all_option(exclude);

OK, so we possibly exclude the largest one or nothing (i.e. at most
one --keep-pack is given) here.

> @@ -353,6 +480,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
>  	const char *name;
>  	pid_t pid;
>  	int daemonized = 0;
> +	int keep_base_pack = -1;
>  
>  	struct option builtin_gc_options[] = {
>  		OPT__QUIET(&quiet, N_("suppress progress reporting")),
> @@ -362,6 +490,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
>  		OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
>  		OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
>  		OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
> +		OPT_BOOL(0, "keep-base-pack", &keep_base_pack,
> +			 N_("repack all other packs except the base pack")),
>  		OPT_END()
>  	};
>  
> @@ -427,8 +557,19 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
>  			 */
>  			daemonized = !daemonize();
>  		}
> -	} else
> -		add_repack_all_option();
> +	} else {
> +		struct packed_git *base_pack = find_the_base_pack();
> +		struct packed_git *exclude = NULL;
> +
> +		if (keep_base_pack != -1) {
> +			if (keep_base_pack)
> +				exclude = base_pack;

OK, --keep-base-pack option always wins if given...

> +		} else if (base_pack && big_base_pack_threshold &&
> +			   base_pack->pack_size >= big_base_pack_threshold)
> +			exclude = base_pack;

...and then if the largest one is larger than the threshold, it (and
it alone) is kept, but otherwise nothing is kept automatically.

But to those who say "packs larger than this value are too big" via
configuration, keeping only the largest of these above-threshold
packs would look counter-intuitive, wouldn't it, I wonder?

> +		add_repack_all_option(exclude);
> +	}

> diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
> index 41b0be575d..863fdbb0fd 100755
> --- a/t/t6500-gc.sh
> +++ b/t/t6500-gc.sh
> @@ -5,6 +5,13 @@ test_description='basic git gc tests
>  
>  . ./test-lib.sh
>  
> +test_expect_success 'setup' '
> +	# do not let the amount of physical memory affects gc
> +	# behavior, make sure the pack_objects_uses_too_much_memory()
> +	# always returns false
> +	git config gc.bigBasePackThreshold 2g

Hmph, that is because the configuration wins and we know the trash
repository will never have a pack that large.  OK.

I won't comment on the style issue in the remainder, as it shares
the same as another patch.

> @@ -116,6 +123,28 @@ test_expect_success 'background auto gc respects lock for all operations' '
>  	test_path_is_file .git/refs/heads/should-be-loose
>  '
>  
> +test_expect_success 'gc --keep-base-pack' '
> +	test_create_repo keep-pack &&
> +	(
> +		cd keep-pack &&
> +		for i in 10; do
> +			test_commit $i
> +		done &&
> +		git gc &&

This, because of the set-up step, is guaranteed to pack everything
into one.

> +		( cd .git/objects/pack && ls *.pack ) >pack-list &&
> +		test_line_count = 1 pack-list &&
> +		BASE_PACK=.git/objects/pack/pack-*.pack &&

And that is what these verify.

> +		for i in 10; do
> +			test_commit more-$i
> +		done &&
> +		git gc --keep-base-pack &&

And we allow --keep-base-pack option to pick the largest pack (there
is only one pack anyway) and keep it, which ...

> +		( cd .git/objects/pack && ls *.pack ) >pack-list &&
> +		test_line_count = 2 pack-list &&

... results in two packs in total.  And

> +		test_path_is_file $BASE_PACK &&

... we make sure the first pack is left intact (unlike the earlier
'new-count' test).  This sounds sensible.

> +		git fsck
> +	)
> +'
> +
>  # DO NOT leave a detached auto gc process running near the end of the
>  # test script: it can run long enough in the background to racily
>  # interfere with the cleanup in 'test_done'.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 2/5] repack: add --keep-pack option
  2018-03-06 18:25       ` Junio C Hamano
@ 2018-03-07 10:19         ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-07 10:19 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Eric Wong, Git Mailing List, Jeff King,
	Ævar Arnfjörð Bjarmason

On Wed, Mar 7, 2018 at 1:25 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> +--keep-pack=<pack name>::
>> +     Ignore the given pack. This is the equivalent of having
>> +     `.keep` file on the pack. Implies `--honor-pack-keep`.
>> +
>
> A few questions I am not sure how I would answer:
>
>  - Do we want to have this listed in the SYNOPSIS section, too?
>
>  - We would want to make the SP in "<pack name>" consistent with
>    the dash in "<missing-action>" in the same document; which way do
>    we make it uniform?

Probably the latter.

>
>  - Is this description clear enough to convey that we allow more
>    than one instance of this option specified, and the pack names
>    accumulate?

If a question is raised, it's probably not clear.

>  - Are there use cases where we want to _ignore_ on-disk ".keep" and
>    only honor the ones given via the "--keep-pack" options?

I can't think of one. These .keep files are originally lock files and
ignoring them sounds like a bad idea. Perhaps we could add
--no-keep-pack later to explicitly not keep a pack, ignoring the .keep file
if present?

>> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
>> index 38247afbec..553d907d34 100755
>> --- a/t/t7700-repack.sh
>> +++ b/t/t7700-repack.sh
>> @@ -196,5 +196,24 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
>>       git cat-file -t $H1
>>  '
>>
>> +test_expect_success 'repack --keep-pack' '
>> +     test_create_repo keep-pack &&
>> +     (
>> +             cd keep-pack &&
>> +             for cmit in one two three four; do
>> +                     test_commit $cmit &&
>> +                     git repack -d
>> +             done &&
>
> Style: replace "; " before do with LF followed by a few HT.
>
> This 'for' loop would not exit and report error if an early
> test_commit or "git repack -d" fails, would it?

Yeah. I guess I'll just unroll the loop.

>> +             git repack -a -d --keep-pack $KEEP1 --keep-pack $KEEP4 &&
>> +             ls .git/objects/pack/*.pack >new-counts &&
>> +             test_line_count = 3 new-counts &&
>> +             git fsck
>
> One important invariant for this new feature is that $KEEP1 and
> $KEEP4 will both appear in new-counts file, no?  Rename new-counts
> to new-pack-list and inspect the contents, not just line count,
> perhaps?

OK
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-06 10:41     ` [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
  2018-03-06 19:19       ` Junio C Hamano
@ 2018-03-07 10:48       ` Johannes Schindelin
  2018-03-07 18:40         ` Junio C Hamano
  2018-03-12 19:30       ` Ævar Arnfjörð Bjarmason
  2 siblings, 1 reply; 273+ messages in thread
From: Johannes Schindelin @ 2018-03-07 10:48 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: e, git, peff, Ævar Arnfjörð Bjarmason, Junio C Hamano

[-- Attachment #1: Type: text/plain, Size: 2649 bytes --]

Hi Duy,

On Tue, 6 Mar 2018, Nguyễn Thái Ngọc Duy wrote:

> diff --git a/builtin/gc.c b/builtin/gc.c
> index 77fa720bd0..273657ddf4 100644
> --- a/builtin/gc.c
> +++ b/builtin/gc.c
> @@ -187,7 +211,101 @@ static int too_many_packs(void)
>  	return gc_auto_pack_limit < cnt;
>  }
>  
> -static void add_repack_all_option(void)
> +static inline unsigned long total_ram(void)
> +{
> +	unsigned long default_ram = 4;
> +#ifdef HAVE_SYSINFO
> +	struct sysinfo si;
> +
> +	if (!sysinfo(&si))
> +		return si.totalram;
> +#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
> +	int64_t physical_memory;
> +	int mib[2];
> +	size_t length;
> +
> +	mib[0] = CTL_HW;
> +	mib[1] = HW_MEMSIZE;
> +	length = sizeof(int64_t);
> +	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
> +		return physical_memory;
> +#elif defined(GIT_WINDOWS_NATIVE)
> +	MEMORYSTATUSEX memInfo;
> +
> +	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
> +	if (GlobalMemoryStatusEx(&memInfo))
> +		return memInfo;ullTotalPhys;

This fails to compile:

builtin/gc.c: In function 'total_ram':
builtin/gc.c:235:10: error: incompatible types when returning type 'MEMORYSTATUSEX {aka struct _MEMORYSTATUSEX}' but 'long unsigned int' was expected
	   return memInfo;ullTotalPhys;
	          ^~~~~~~
builtin/gc.c:234:2: error: this 'if' clause does not guard... [-Werror=misleading-indentation]
	  if (GlobalMemoryStatusEx(&memInfo))
	  ^~
builtin/gc.c:235:18: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'
	   return memInfo;ullTotalPhys;
	                  ^~~~~~~~~~~~
builtin/gc.c:235:18: error: 'ullTotalPhys' undeclared (first use in this function)
builtin/gc.c:235:18: note: each undeclared identifier is reported only once for each function it appears in

I suspect that the first semicolon wanted to be a period instead. At least
it fixes the build here (that's all I can test, I'm at GitMerge and miss a
very interesting discussion about the serialized commit graph to write
this):

-- snip --
diff --git a/builtin/gc.c b/builtin/gc.c
index 4f46a99ab54..9c12f1ee9af 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -232,7 +232,7 @@ static inline unsigned long total_ram(void)
 
 	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
 	if (GlobalMemoryStatusEx(&memInfo))
-		return memInfo;ullTotalPhys;
+		return memInfo.ullTotalPhys;
 #else
 	fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
 		default_ram);

-- snap --

Junio, may I ask you to put this into a SQUASH??? commit so that the
Windows build no longer fails?

Thanks,
Dscho

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-06 19:19       ` Junio C Hamano
@ 2018-03-07 10:48         ` Duy Nguyen
  2018-03-07 18:38           ` Junio C Hamano
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-07 10:48 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Eric Wong, Git Mailing List, Jeff King,
	Ævar Arnfjörð Bjarmason

On Wed, Mar 7, 2018 at 2:19 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> +--keep-base-pack::
>> +     All packs except the base pack are consolidated into a single
>> +     pack. The largest pack is considered the base pack.
>
> This makes it sound as if packs with .keep are also repacked unless
> they meet the threshold for "base pack".  Is that what you actually
> implemented?

Copy/paste problem. That is, I copied this from --auto description,
but I missed the "(except those marked with a `.keep` file)" part. No,
I don't think ignoring .keep files is a good idea, at least not by
default.

> In order to do so, [2/5] needs to allow the "--keep-pack" option
> override the on-disk .keep files.  In an earlier message, I wondered
> if such an arrangement is useful in some use cases; I think it is,
> and because those who do want to see the on-disk .keep files honored
> can collect and include them in the set of packs to be kept via
> "--keep-pack" (after all this is an option for low-level scripting
> anyway).

At gc level I don't think we need to allow this. But yeah git-repack
has --pack-kept-objects to ignore .keep. If they specify this, then
repack should ignore .keep (but still follow whatever --keep-pack is
specified). There's some interesting interaction between .keep and
pack bitmap feature in pack-objects though. I'm not so sure what
happens down there yet.

>> +Set environment variable `GIT_TRACE` in order to see the memory usage
>> +estimation in `git gc --auto` that determines whether the base pack is
>> +kept.
>
> This is somewhat a puzzling use of trace.  It sounds more like a way
> to find out "how" memory usage estimation is done and arriving at a
> wrong value for those who want to debug the feature.

Yeah. I'm not sure if this estimation could be so problematic that
people would need to debug it this way. A cleaner way (if we think people
will need this often) is to just add a new option in "git gc" to report
this estimation breakdown and do nothing else.

>> +static struct packed_git *find_the_base_pack(void)
>> +{
>> +     struct packed_git *p, *base = NULL;
>> +
>> +     prepare_packed_git();
>> +
>> +     for (p = packed_git; p; p = p->next) {
>> +             if (p->pack_local &&
>> +                 (!base || base->pack_size < p->pack_size))
>> +                     base = p;
>> +     }
>> +
>> +     return base;
>> +}
>
> This is finding the largest pack.

The discussion on .keep files raises one question for me, what if the
largest pack already has a .keep file, do we still consider it the
base pack, or should we find the next largest non-kept pack?

I'm guessing we keep things simple here and ignore .keep files.

>> +#ifdef HAVE_SYSINFO
>> +     struct sysinfo si;
>> +
>> +     if (!sysinfo(&si))
>> +             return si.totalram;
>> +#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
>> +     int64_t physical_memory;
>> +     int mib[2];
>> +     size_t length;
>> +
>> +     mib[0] = CTL_HW;
>> +     mib[1] = HW_MEMSIZE;
>> +     length = sizeof(int64_t);
>> +     if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
>> +             return physical_memory;
>> +#elif defined(GIT_WINDOWS_NATIVE)
>> +     MEMORYSTATUSEX memInfo;
>> +
>> +     memInfo.dwLength = sizeof(MEMORYSTATUSEX);
>> +     if (GlobalMemoryStatusEx(&memInfo))
>> +             return memInfo;ullTotalPhys;
>
> Is this legal C in Microsoft land?

That's the problem with writing code without a way to test it. At
least travis helped catch a compiler bug on mac.

I'm torn between just #error here and let Windows/Mac guys implement
it (which they may scream "too much work, I don't wanna") but if I
help write something first, yes things are potentially broken and need
verification from those guys. I guess I'll just fix this up and hope
non-linux guys do the rest.

>> +#else
>> +     fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
>> +             default_ram);
>> +#endif
>> +     return default_ram * 1024 * 1024 * 1024;
>> +}
>
> I wonder if the above should go somewhere under compat/ without
> ifdef but split into separate files for individual archs (I do not
> know the answer to this question).

My first choice too. I chose this way after seeing online_cpus() in
thread-utils.c. Not sure which way is best either.

>> @@ -427,8 +557,19 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
>>                        */
>>                       daemonized = !daemonize();
>>               }
>> -     } else
>> -             add_repack_all_option();
>> +     } else {
>> +             struct packed_git *base_pack = find_the_base_pack();
>> +             struct packed_git *exclude = NULL;
>> +
>> +             if (keep_base_pack != -1) {
>> +                     if (keep_base_pack)
>> +                             exclude = base_pack;
>
> OK, --keep-base-pack option always wins if given...
>
>> +             } else if (base_pack && big_base_pack_threshold &&
>> +                        base_pack->pack_size >= big_base_pack_threshold)
>> +                     exclude = base_pack;
>
> ...and then if the largest one is larger than the threshold, it (and
> it alone) is kept, but otherwise nothing is kept automatically.
>
> But to those who say "packs larger than this value is too big" via
> configuration, keeping only the largest of these above-threshold
> packs would look counter-intuitive, wouldn't it, I wonder?

I think I'll just clarify this in the document. There may be a use
case for keeping multiple large packs, but I don't see it (*). We can
deal with it when it comes.

(*) Well, I see one. In a submodule setting, if we merge the object stores of
all submodules back into the supermodule, we have multiple base packs
and probably want to keep them that way. It's still a long way to get
there (not even sure if submodule people want to get there)

>> diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
>> index 41b0be575d..863fdbb0fd 100755
>> --- a/t/t6500-gc.sh
>> +++ b/t/t6500-gc.sh
>> @@ -5,6 +5,13 @@ test_description='basic git gc tests
>>
>>  . ./test-lib.sh
>>
>> +test_expect_success 'setup' '
>> +     # do not let the amount of physical memory affects gc
>> +     # behavior, make sure the pack_objects_uses_too_much_memory()
>> +     # always returns false
>> +     git config gc.bigBasePackThreshold 2g
>
> Hmph, that is because the configuration wins and we know the trash
> repository will never have a pack that large.  OK.

Credit goes to travis linux32 job. I wouldn't notice this otherwise.

A thought occurred to me when I wrote this, though. Should we support
a special value "infinite" (or just "max") in our config code?

The use of a super large gc.bigBasePackThreshold to disable keeping
the base pack is intended. But I can't go too high here, as it may break
limits on 32-bit platforms. And 2g sounds really arbitrary.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-07 10:48         ` Duy Nguyen
@ 2018-03-07 18:38           ` Junio C Hamano
  2018-03-12 18:56             ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-07 18:38 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Eric Wong, Git Mailing List, Jeff King,
	Ævar Arnfjörð Bjarmason

Duy Nguyen <pclouds@gmail.com> writes:

>>> +Set environment variable `GIT_TRACE` in order to see the memory usage
>>> +estimation in `git gc --auto` that determines whether the base pack is
>>> +kept.
>>
>> This is somewhat a puzzling use of trace.  It sounds more like a way
>> to find out "how" memory usage estimation is done and arriving at a
>> wrong value for those who want to debug the feature.
>
> Yeah. I'm not sure if this estimation could be really problematic that
> people need to debug this way. A cleaner way (if we think people will
> need this often) is just add a new option in "git gc" to report this
> estimation breakdown and do nothing else.

Actually after reading the implementation and seeing what it does, I
personally do not have any problem with the way GIT_TRACE is used
for this purpose in this patch.  I am not sure how interesting the
information given by that codepath is in real life; it looks even less
interesting than, say, what comes out of "verify-pack --stat".

>> This is finding the largest pack.
>
> The discussion on .keep files raises one question for me, what if the
> largest pack already has a .keep file, do we still consider it the
> base pack, or should we find the next largest non-kept pack?
>
> I'm guessing we keep things simple here and ignore .keep files.

I agree that it is a sensible design decision.

>>> +#elif defined(GIT_WINDOWS_NATIVE)
>>> +     MEMORYSTATUSEX memInfo;
>>> +
>>> +     memInfo.dwLength = sizeof(MEMORYSTATUSEX);
>>> +     if (GlobalMemoryStatusEx(&memInfo))
>>> +             return memInfo;ullTotalPhys;
>>
>> Is this legal C in Microsoft land?
>
> That's the problem with writing code without a way to test it. At
> least travis helped catch a compiler bug on mac.
>
> I'm torn between just #error here and let Windows/Mac guys implement
> it (which they may scream "too much work, I don't wanna") but if I
> help write something first, yes things are potentially broken and need
> verification from those guys. I guess I'll just fix this up and hope
> non-linux guys do the rest.

Yup, we all collaborate and help in ways each of us can.  None of us
can be expected to do any more than that ;-)

>> I wonder if the above should go somewhere under compat/ without
>> ifdef but split into separate files for individual archs (I do not
>> know the answer to this question).
>
> My first choice too. I chose this way after seeing online_cpus()
> thread-utils.c. Not sure which way is best either.

OK.

>> But to those who say "packs larger than this value is too big" via
>> configuration, keeping only the largest of these above-threshold
>> packs would look counter-intuitive, wouldn't it, I wonder?
>
> I think I'll just clarify this in the document. There may be a use
> case for keeping multiple large packs, but I don't see it (*). We can
> deal with it when it comes.

When the project's history grows too much, a large pack that holds
its first 10 years of stuff, together with another one that holds
its second 20 years of stuff, may both be larger than the threshold
and want to be kept.  If we pick only the largest one, we would
explode the other one and repack together with loose objects.

But realistically, those who would want to control the way in which
their repository is packed to such a degree are very likely to add
".keep" files to these two packfiles themselves, so the above would
probably not be a concern.  Perhaps we shouldn't do the "automatically
pick the largest one and exclude from repacking" when there is a
packfile that is marked with ".keep"?

> The use of super large gc.bigBasePackThreshold to disable this keeping
> base pack is intended. But I can't go too high here it may break
> limits on 32 bit platforms. And 2g sounds really arbitrary.

You could use 42m instead to clarify that it really is an arbitrary
threshold that was chosen only for the purpose of this test perhaps?
;-)

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-07 10:48       ` Johannes Schindelin
@ 2018-03-07 18:40         ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-07 18:40 UTC (permalink / raw)
  To: Johannes Schindelin
  Cc: Nguyễn Thái Ngọc Duy, e, git, peff,
	Ævar Arnfjörð Bjarmason

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Junio, may I ask you to put this into a SQUASH??? commit so that the
> Windows build no longer fails?

Thanks for a reminder; I also spotted it (I first thought I screwed
up in my editor while reviewing and then went back to the original
on the list) and sent out a response, but then by that time I was
already far into the day's integration cycle.

Will queue a SQUASH??? at the tip.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 00/12] Reduce pack-objects memory footprint
  2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
                       ` (9 preceding siblings ...)
  2018-03-05  9:28     ` [PATCH/RFC v2 0/9] Reduce pack-objects memory footprint Duy Nguyen
@ 2018-03-08 11:42     ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
                         ` (12 more replies)
  10 siblings, 13 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

v3 cleans up a bit to avoid the horrible macros that v2 adds, and
adds some more documentation for this struct since it's getting harder
to just look at the struct and see what field is related to what.

v3 also adds three more patches and reduces another 16 bytes (total
struct reduction now is 41%). After this there's hardly anything else I
could do. Two 64-bit fields left, but even if I shrink them, I'd lose
it to padding. There's still one possibility to share in_pack_offset
with idx.offset, but it's risky.

These three patches are made to optimize for the common case. The
uncommon cases will suffer some performance loss:

- 10/12 limits the cached compressed delta size to 64k (default 1000
  bytes). If you normally have lots of huge deltas, you're going to
  take a hit because more deltas must be recreated at writing phase.
  Note that it does not stop pack-objects from creating deltas larger
  than 64k.

- 11/12 shrinks the in-memory uncompressed object size field to 4GB.
  Whenever we need the size of an object larger than that, we read the
  pack again to retrieve the information, which is much slower than
  accessing a piece of memory. Again I'm assuming these giant blobs are
  so rare that this performance hit won't matter.

- 12/12 is similar to 11/12 and reduces uncompressed delta size to
  4GB. Frankly a 4GB delta is still ridiculous, but I don't think we
  gain more by shrinking it further. If your packs have one of those
  giant deltas, it still works, delta_size will be read back from the
  pack again.

The following interdiff does _NOT_ cover the new patches, just the
first nine that v2 has.

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 55f19a1f18..82a4a95888 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,19 +29,13 @@
 #include "list.h"
 #include "packfile.h"
 
-#define DELTA(obj) \
-	((obj)->delta_idx ? &to_pack.objects[(obj)->delta_idx - 1] : NULL)
-#define DELTA_CHILD(obj) \
-	((obj)->delta_child_idx ? &to_pack.objects[(obj)->delta_child_idx - 1] : NULL)
-#define DELTA_SIBLING(obj) \
-	((obj)->delta_sibling_idx ? &to_pack.objects[(obj)->delta_sibling_idx - 1] : NULL)
-
-#define CLEAR_DELTA(obj) (obj)->delta_idx = 0
-#define CLEAR_DELTA_CHILD(obj) (obj)->delta_child_idx = 0
-#define CLEAR_DELTA_SIBLING(obj) (obj)->delta_sibling_idx = 0
-
-#define SET_DELTA(obj, val) (obj)->delta_idx = ((val) - to_pack.objects) + 1
-#define SET_DELTA_CHILD(obj, val) (obj)->delta_child_idx = ((val) - to_pack.objects) + 1
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -381,7 +375,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = IN_PACK(&to_pack, entry);
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -492,7 +486,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!IN_PACK(&to_pack, entry))
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
@@ -557,7 +551,7 @@ static enum write_one_status write_one(struct hashfile *f,
 		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			CLEAR_DELTA(e);
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -672,8 +666,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		CLEAR_DELTA_CHILD(&objects[i]);
-		CLEAR_DELTA_SIBLING(&objects[i]);
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -1067,19 +1061,8 @@ static int want_object_in_pack(const struct object_id *oid,
 
 	want = 1;
 done:
-	if (want && *found_pack && !(*found_pack)->index) {
-		struct packed_git *p = *found_pack;
-
-		if (to_pack.in_pack_count >= (1 << OE_IN_PACK_BITS))
-			die(_("too many packs to handle in one go. "
-			      "Please add .keep files to exclude\n"
-			      "some pack files and keep the number "
-			      "of non-kept files below %d."),
-			    1 << OE_IN_PACK_BITS);
-
-		p->index = to_pack.in_pack_count++;
-		to_pack.in_pack[p->index] = p;
-	}
+	if (want && *found_pack && !(*found_pack)->index)
+		oe_add_pack(&to_pack, *found_pack);
 
 	return want;
 }
@@ -1104,9 +1087,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		if (found_pack->index <= 0)
-			die("BUG: found_pack should be NULL instead of having non-positive index");
-		entry->in_pack_idx = found_pack->index;
+		oe_set_in_pack(entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1431,8 +1412,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (IN_PACK(&to_pack, entry)) {
-		struct packed_git *p = IN_PACK(&to_pack, entry);
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1567,8 +1548,8 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
-	const struct packed_git *a_in_pack = IN_PACK(&to_pack, a);
-	const struct packed_git *b_in_pack = IN_PACK(&to_pack, b);
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
 	if (!a_in_pack && !b_in_pack)
@@ -1609,12 +1590,12 @@ static void drop_reused_delta(struct object_entry *entry)
 		else
 			idx = &oe->delta_sibling_idx;
 	}
-	CLEAR_DELTA(entry);
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(IN_PACK(&to_pack, entry), entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1884,8 +1865,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && IN_PACK(&to_pack, trg_entry) &&
-	    IN_PACK(&to_pack, trg_entry) == IN_PACK(&to_pack, src_entry) &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -2994,16 +2975,6 @@ static int option_parse_unpack_unreachable(const struct option *opt,
 	return 0;
 }
 
-static void init_in_pack_mapping(struct packing_data *to_pack)
-{
-	/* let IN_PACK() return NULL if in_pack_idx is zero */
-	to_pack->in_pack[to_pack->in_pack_count++] = NULL;
-	/*
-	 * the rest is lazily initialized only for packs that we want
-	 * in want_object_in_pack().
-	 */
-}
-
 int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 {
 	int use_internal_rev_list = 0;
@@ -3236,7 +3207,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			}
 		}
 	}
-	init_in_pack_mapping(&to_pack);
+
+	/* make sure IN_PACK(0) return NULL */
+	oe_add_pack(&to_pack, NULL);
 
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index 1360a93311..256a63f892 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -64,7 +64,7 @@ void bitmap_writer_build_type_index(struct packing_data *to_pack,
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		IN_PACK_POS(to_pack, entry) = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (entry->type) {
 		case OBJ_COMMIT:
@@ -149,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return IN_PACK_POS(writer.to_pack, entry);
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index f21479fe16..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = IN_PACK_POS(mapping, oe) + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-objects.h b/pack-objects.h
index a57aca5f03..3c15cf7b23 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,15 +1,9 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
-#define OE_DFS_STATE_BITS 2
-#define OE_DEPTH_BITS 12
-#define OE_IN_PACK_BITS 14
-
-#define IN_PACK_POS(to_pack, obj) \
-	(to_pack)->in_pack_pos[(struct object_entry *)(obj) - (to_pack)->objects]
-
-#define IN_PACK(to_pack, obj) \
-	(to_pack)->in_pack[(obj)->in_pack_idx]
+#define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		14
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -28,6 +22,51 @@ enum dfs_state {
  * The size of struct nearly determines pack-objects's memory
  * consumption. This struct is packed tight for that reason. When you
  * add or reorder something in this struct, think a bit about this.
+ *
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 and
+ * is only valid after the object is written down and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size is not
+ * cached (ie. raw data in a pack) but available via revindex.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset, in_pack_header_size) tuple contains
+ * the location of the object in the source pack, with or without
+ * header.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed, "delta" points to
+ * the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached. Delta recreation technically only depends on "delta"
+ * pointer, but delta_size is still used to verify it's the same as
+ * before.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
  */
 struct object_entry {
 	struct pack_idx_entry idx;
@@ -103,4 +142,109 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
+static inline unsigned int oe_add_pack(struct packing_data *pack,
+				       struct packed_git *p)
+{
+	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
+		die(_("too many packs to handle in one go. "
+		      "Please add .keep files to exclude\n"
+		      "some pack files and keep the number "
+		      "of non-kept files below %d."),
+		    1 << OE_IN_PACK_BITS);
+	if (p) {
+		if (p->index > 0)
+			die("BUG: this packed is already indexed");
+		p->index = pack->in_pack_count;
+	}
+	pack->in_pack[pack->in_pack_count] = p;
+	return pack->in_pack_count++;
+}
+
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	return pack->in_pack[e->in_pack_idx];
+
+}
+
+static inline void oe_set_in_pack(struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (p->index <= 0)
+		die("BUG: found_pack should be NULL "
+		    "instead of having non-positive index");
+	e->in_pack_idx = p->index;
+
+}
+
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif

Nguyễn Thái Ngọc Duy (12):
  pack-objects: a bit of document about struct object_entry
  pack-objects: turn type and in_pack_type to bitfields
  pack-objects: use bitfield for object_entry::dfs_state
  pack-objects: use bitfield for object_entry::depth
  pack-objects: note about in_pack_header_size
  pack-objects: move in_pack_pos out of struct object_entry
  pack-objects: move in_pack out of struct object_entry
  pack-objects: refer to delta objects by index instead of pointer
  pack-objects: reorder 'hash' to pack struct object_entry
  pack-objects: shrink z_delta_size field in struct object_entry
  pack-objects: shrink size field in struct object_entry
  pack-objects: shrink delta_size field in struct object_entry

 Documentation/config.txt           |   4 +-
 Documentation/git-pack-objects.txt |  13 +-
 Documentation/git-repack.txt       |   4 +-
 builtin/pack-objects.c             | 269 +++++++++++++++++----------
 cache.h                            |   3 +
 object.h                           |   1 -
 pack-bitmap-write.c                |   8 +-
 pack-bitmap.c                      |   2 +-
 pack-bitmap.h                      |   4 +-
 pack-objects.h                     | 288 ++++++++++++++++++++++++++---
 10 files changed, 460 insertions(+), 136 deletions(-)

-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-09 22:34         ` Junio C Hamano
  2018-03-08 11:42       ` [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                         ` (11 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The role of this comment block becomes more important after we shuffle
fields around to shrink this struct. It will be much harder to see what
field is related to what. This also documents the holes in this struct
according to pahole.

A couple of notes on shrinking the struct:

1) The reader may notice one thing from this document and the shrinking
business. If "delta" is NULL, all other delta-related fields should be
irrelevant. We could group all these in a separate struct and replace
them all with a pointer to this struct (allocated separately).

This does not help much though since 85% of objects are deltified
(source: linux-2.6.git). The gain is only from non-delta objects, which
is not that significant.

2) The field in_pack_offset and idx.offset could be merged. But we need
to be very careful. Up until the very last phase (object writing),
idx.offset is not used and can hold in_pack_offset. Then idx.offset will
be updated with _destination pack's_ offset, not source's. But since we
always write delta's bases first, and we only use in_pack_offset in
writing phase when we reuse objects, we should be ok?

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..f834ead541 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,52 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 and
+ * is only valid after the object is written down and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size is not
+ * cached (ie. raw data in a pack) but available via revindex.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset, in_pack_header_size) tuple contains
+ * the location of the object in the source pack, with or without
+ * header.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed, "delta" points to
+ * the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached. Delta recreation technically only depends on "delta"
+ * pointer, but delta_size is still used to verify it's the same as
+ * before.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
@@ -28,6 +74,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 
+	/* XXX 28 bits hole, try to pack */
 	/*
 	 * State flags for depth-first search used for analyzing delta cycles.
 	 *
@@ -40,6 +87,7 @@ struct object_entry {
 		DFS_DONE
 	} dfs_state;
 	int depth;
+	/* size: 136, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-09 22:54         ` Junio C Hamano
  2018-03-08 11:42       ` [PATCH/RFC v3 03/12] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                         ` (10 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This saves 8 bytes in sizeof(struct object_entry). On a large
repository like linux-2.6.git (6.5M objects), this saves us 52MB
memory.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 14 ++++++++++++--
 cache.h                |  2 ++
 object.h               |  1 -
 pack-objects.h         |  8 ++++----
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..fd217cb51f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
@@ -1580,6 +1586,10 @@ static void drop_reused_delta(struct object_entry *entry)
 		 */
 		entry->type = sha1_object_info(entry->idx.oid.hash,
 					       &entry->size);
+	} else {
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->type = type;
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-objects.h b/pack-objects.h
index f834ead541..85b01b66da 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -60,11 +60,11 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
+	unsigned type:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
@@ -74,7 +74,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 
-	/* XXX 28 bits hole, try to pack */
+	/* XXX 22 bits hole, try to pack */
 	/*
 	 * State flags for depth-first search used for analyzing delta cycles.
 	 *
@@ -87,7 +87,7 @@ struct object_entry {
 		DFS_DONE
 	} dfs_state;
 	int depth;
-	/* size: 136, padding: 4 */
+	/* size: 128, padding: 4 */
 };
 
 struct packing_data {
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 03/12] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 04/12] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                         ` (9 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 33 ++++++++++++++++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fd217cb51f..a4dbb40824 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index 85b01b66da..628c45871c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS	2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 /*
  * basic object info
  * -----------------
@@ -73,21 +88,13 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
+
+	/* XXX 20 bits hole, try to pack */
 
-	/* XXX 22 bits hole, try to pack */
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
-	/* size: 128, padding: 4 */
+
+	/* size: 120 */
 };
 
 struct packing_data {
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 04/12] pack-objects: use bitfield for object_entry::depth
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (2 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 03/12] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-09 23:07         ` Junio C Hamano
  2018-03-08 11:42       ` [PATCH/RFC v3 05/12] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
                         ` (8 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This does not give us any saving due to padding. But we will be able
to save once we cut 4 bytes out of this struct in a subsequent patch.

Because of struct packing, from now on we can only handle a max depth
of 4095 (or even lower when new booleans are added to this struct).
This should be ok since long delta chains cause significant slowdown
anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 4 ++++
 pack-objects.h                     | 8 +++-----
 5 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index a4dbb40824..cfd97da7db 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth > (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS));
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 628c45871c..4b17402953 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -89,12 +90,9 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+	unsigned depth:OE_DEPTH_BITS;
 
-	/* XXX 20 bits hole, try to pack */
-
-	int depth;
-
-	/* size: 120 */
+	/* size: 120, bit_padding: 8 bits */
 };
 
 struct packing_data {
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 05/12] pack-objects: note about in_pack_header_size
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (3 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 04/12] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 06/12] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                         ` (7 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The object header in a pack is packed really tight (see
pack-format.txt). Even with an 8-byte length, we need 9-10 bytes at
most, plus a hash (20 bytes), which means this field only needs to
store a number smaller than 32 (5 bits).

This is trickier to pack tight, though: since a new hash algorithm is
coming, the number of bits needed may quickly increase. So leave it
for now.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pack-objects.h b/pack-objects.h
index 4b17402953..2ccd6359d2 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -78,7 +78,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
-	unsigned char in_pack_header_size;
+	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 06/12] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (4 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 05/12] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 07/12] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                         ` (6 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (it is never freed, just as objects[] is never
freed). This saves us 8 bytes in struct object_entry.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 ++-
 pack-bitmap-write.c    |  8 +++++---
 pack-bitmap.c          |  2 +-
 pack-bitmap.h          |  4 +++-
 pack-objects.h         | 18 ++++++++++++++++--
 5 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index cfd97da7db..7bb5544883 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -878,7 +878,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..256a63f892 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (entry->type) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index 2ccd6359d2..9ab0ce300d 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -77,7 +77,6 @@ struct object_entry {
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -92,7 +91,7 @@ struct object_entry {
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 120, bit_padding: 8 bits */
+	/* size: 112, bit_padding: 8 bits */
 };
 
 struct packing_data {
@@ -101,6 +100,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -131,4 +132,17 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
 #endif
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 07/12] pack-objects: move in_pack out of struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (5 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 06/12] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-09 23:21         ` Junio C Hamano
  2018-03-08 11:42       ` [PATCH/RFC v3 08/12] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                         ` (5 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Instead of using 8 bytes (on 64-bit architectures) to store a pointer
to a pack, use an index, since the number of packs should be
relatively small.

This limits the number of packs we can handle to 16k. For now, if you
hit the 16k pack file limit, pack-objects will simply fail [1].

This technically saves 7 bytes. But we don't see any of that in
practice due to padding. The saving becomes real when we pack this
struct tighter later.

[1] The escape hatch is to use .keep files to keep the number of
    non-kept pack files below the 16k limit. Then you can go for
    another pack-objects run to combine another 16k pack files.
    Repeat until you're satisfied.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 ++++++
 builtin/pack-objects.c             | 40 +++++++++++++++---------
 cache.h                            |  1 +
 pack-objects.h                     | 49 ++++++++++++++++++++++++++++--
 4 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 3503c9e3e6..b8d936ccf5 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -269,6 +269,15 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
+LIMITATIONS
+-----------
+
+This command could only handle 16384 existing pack files at a time.
+If you have more than this, you need to exclude some pack files with
+".keep" file and --honor-pack-keep option, to combine 16k pack files
+in one, then remove these .keep files and run pack-objects one more
+time.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7bb5544883..7df525e201 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,8 @@
 #include "list.h"
 #include "packfile.h"
 
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
@@ -1024,7 +1026,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			return want;
+			goto done;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1047,11 +1049,16 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				return want;
+				goto done;
 		}
 	}
 
-	return 1;
+	want = 1;
+done:
+	if (want && *found_pack && !(*found_pack)->index)
+		oe_add_pack(&to_pack, *found_pack);
+
+	return want;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1074,7 +1081,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		oe_set_in_pack(entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1406,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1542,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1587,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1857,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -3191,6 +3200,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
+	/* make sure IN_PACK(0) return NULL */
+	oe_add_pack(&to_pack, NULL);
+
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.h b/pack-objects.h
index 9ab0ce300d..59c44b3420 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,6 +3,7 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		14
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -18,6 +19,10 @@ enum dfs_state {
 };
 
 /*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ *
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 and
@@ -66,7 +71,6 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -78,6 +82,7 @@ struct object_entry {
 	unsigned long z_delta_size;	/* delta data size (compressed) */
 	uint32_t hash;			/* name hint hash */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
@@ -89,9 +94,12 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+
+	/* XXX 8 bits hole, try to pack */
+
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 112, bit_padding: 8 bits */
+	/* size: 112, padding: 4, bit_padding: 18 bits */
 };
 
 struct packing_data {
@@ -102,6 +110,8 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	int in_pack_count;
+	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -145,4 +155,39 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
+static inline unsigned int oe_add_pack(struct packing_data *pack,
+				       struct packed_git *p)
+{
+	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
+		die(_("too many packs to handle in one go. "
+		      "Please add .keep files to exclude\n"
+		      "some pack files and keep the number "
+		      "of non-kept files below %d."),
+		    1 << OE_IN_PACK_BITS);
+	if (p) {
+		if (p->index > 0)
+			die("BUG: this packed is already indexed");
+		p->index = pack->in_pack_count;
+	}
+	pack->in_pack[pack->in_pack_count] = p;
+	return pack->in_pack_count++;
+}
+
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	return pack->in_pack[e->in_pack_idx];
+
+}
+
+static inline void oe_set_in_pack(struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (p->index <= 0)
+		die("BUG: found_pack should be NULL "
+		    "instead of having non-positive index");
+	e->in_pack_idx = p->index;
+
+}
+
 #endif
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 08/12] pack-objects: refer to delta objects by index instead of pointer
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (6 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 07/12] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-14 16:18         ` Junio C Hamano
  2018-03-08 11:42       ` [PATCH/RFC v3 09/12] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
                         ` (4 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Notice that packing_data::nr_objects is uint32_t: we can only handle a
maximum of 4G objects, and can address all of them with a uint32_t. If
we use a pointer here, we waste 4 bytes on 64-bit architectures.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

There are holes in this struct but this patch is already big. Struct
packing can be done separately. Even with holes, we save 8 bytes per
object_entry.

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
 pack-objects.h         |  71 ++++++++++++++++++++++---
 2 files changed, 127 insertions(+), 60 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7df525e201..82a4a95888 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -127,11 +133,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -288,12 +294,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -317,7 +323,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -343,7 +349,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,8 +385,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -408,7 +414,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -427,7 +433,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -467,13 +473,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -488,7 +494,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (entry->type != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -540,12 +546,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -607,34 +613,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -645,7 +651,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -660,8 +666,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -671,11 +677,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1498,10 +1504,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			entry->type = entry->in_pack_type;
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1572,17 +1578,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1624,7 +1632,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1649,7 +1657,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1672,7 +1680,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1687,7 +1695,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1870,7 +1878,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1946,7 +1954,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1975,7 +1983,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1984,13 +1992,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2059,7 +2067,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2109,7 +2117,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2117,7 +2125,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2438,7 +2446,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index 59c44b3420..1c0ad4c9ef 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -72,11 +72,13 @@ struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
+	/* XXX 4 bytes hole, try to pack */
+
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -99,7 +101,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 112, padding: 4, bit_padding: 18 bits */
+	/* size: 104, padding: 4, bit_padding: 18 bits */
 };
 
 struct packing_data {
@@ -190,4 +192,61 @@ static inline void oe_set_in_pack(struct object_entry *e,
 
 }
 
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 09/12] pack-objects: reorder 'hash' to pack struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (7 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 08/12] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 10/12] pack-objects: shrink z_delta_size field in " Nguyễn Thái Ngọc Duy
                         ` (3 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index 1c0ad4c9ef..3c15cf7b23 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -77,12 +77,10 @@ struct object_entry {
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	/* XXX 4 bytes hole, try to pack */
-
+	uint32_t hash;			/* name hint hash */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	uint32_t hash;			/* name hint hash */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
@@ -101,7 +99,7 @@ struct object_entry {
 
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 104, padding: 4, bit_padding: 18 bits */
+	/* size: 96, bit_padding: 18 bits */
 };
 
 struct packing_data {
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 10/12] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (8 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 09/12] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 11/12] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
                         ` (2 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only cache a delta when it's smaller than a certain limit. This limit
defaults to 1000, but we save the compressed length in a 64-bit field.
Shrink that field down to 16 bits, so you can only cache 65kb deltas.
Larger deltas must be recomputed when the pack is written down.

This saves us 8 bytes (some from previous bit padding).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  3 ++-
 builtin/pack-objects.c   | 22 ++++++++++++++++------
 pack-objects.h           | 11 ++++++++---
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9bd3f5a789..00fa824448 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2449,7 +2449,8 @@ pack.deltaCacheLimit::
 	The maximum size of a delta, that is cached in
 	linkgit:git-pack-objects[1]. This cache is used to speed up the
 	writing object phase by not having to recompute the final delta
-	result once the best match for all objects is found. Defaults to 1000.
+	result once the best match for all objects is found.
+	Defaults to 1000. Maximum value is 65535.
 
 pack.threads::
 	Specifies the number of threads to spawn when searching for best
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 82a4a95888..39920061e9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2105,12 +2105,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * between writes at that moment.
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
-			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
-			cache_lock();
-			delta_cache_size -= entry->delta_size;
-			delta_cache_size += entry->z_delta_size;
-			cache_unlock();
+			unsigned long size;
+
+			size = do_compress(&entry->delta_data, entry->delta_size);
+			entry->z_delta_size = size;
+			if (entry->z_delta_size == size) {
+				cache_lock();
+				delta_cache_size -= entry->delta_size;
+				delta_cache_size += entry->z_delta_size;
+				cache_unlock();
+			} else {
+				FREE_AND_NULL(entry->delta_data);
+				entry->z_delta_size = 0;
+			}
 		}
 
 		/* if we made n a delta, and if n is already at max
@@ -3089,6 +3096,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (depth > (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
 		    depth, (1 << OE_DEPTH_BITS));
+	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
+		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
+		    1 << OE_Z_DELTA_BITS);
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 3c15cf7b23..cbb39ab568 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,7 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
+#define OE_Z_DELTA_BITS		16
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -80,7 +81,6 @@ struct object_entry {
 	uint32_t hash;			/* name hint hash */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
-	unsigned long z_delta_size;	/* delta data size (compressed) */
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
@@ -93,13 +93,18 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
-	unsigned dfs_state:OE_DFS_STATE_BITS;
 
 	/* XXX 8 bits hole, try to pack */
 
+	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
+	/*
+	 * if delta_data contains a compressed delta, this contains
+	 * the compressed length
+	*/
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
 
-	/* size: 96, bit_padding: 18 bits */
+	/* size: 88, bit_padding: 2 bits */
 };
 
 struct packing_data {
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 11/12] pack-objects: shrink size field in struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (9 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 10/12] pack-objects: shrink z_delta_size field in " Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-08 11:42       ` [PATCH/RFC v3 12/12] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

It's very very rare that an uncompressed object is larger than
4GB (partly because Git does not handle those large files very well to
begin with). Let's optimize it for the common case where object size is
smaller than this limit.

Shrink size field down to 32 bits [1] and one overflow bit. If the size
is too large, we read it back from disk.

Add two compare helpers that can take advantage of the overflow
bit (e.g. if the file is 4GB+, chances are it's already larger than
core.bigFileThreshold and there's no point in comparing the actual
value).

There's no actual saving from this due to holes, which should be gone in
the next patch.

[1] it's actually already 32 bits on 64-bit Windows

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 49 ++++++++++++++++++++++++++----------------
 pack-objects.h         | 48 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 77 insertions(+), 20 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 39920061e9..db040e95db 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -274,7 +274,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (entry->type == OBJ_BLOB &&
-		    entry->size > big_file_threshold &&
+		    oe_size_greater_than(entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -384,12 +384,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
+	unsigned long entry_size = oe_size(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
-					      type, entry->size);
+					      type, entry_size);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -406,7 +407,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	datalen -= entry->in_pack_header_size;
 
 	if (!pack_to_stdout && p->index_version == 1 &&
-	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
 		error("corrupt packed object for %s",
 		      oid_to_hex(&entry->idx.oid));
 		unuse_pack(&w_curs);
@@ -1412,6 +1413,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
+	unsigned long size;
+
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
@@ -1431,13 +1434,14 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &entry->size);
+						   &size);
 		if (used == 0)
 			goto give_up;
 
 		if (type < 0)
 			die("BUG: invalid type %d", type);
 		entry->in_pack_type = type;
+		oe_set_size(entry, size);
 
 		/*
 		 * Determine if this is a delta and if so whether we can
@@ -1505,7 +1509,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			entry->type = entry->in_pack_type;
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = entry->size;
+			entry->delta_size = oe_size(entry);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1513,14 +1517,17 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (entry->type) {
+			unsigned long size;
+
+			size = get_size_from_delta(p, &w_curs,
+				entry->in_pack_offset + entry->in_pack_header_size);
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
-			if (entry->size == 0)
+			oe_set_size(entry, size);
+			if (oe_size_less_than(entry, 1))
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1535,13 +1542,14 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	entry->type = sha1_object_info(entry->idx.oid.hash, &size);
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
 	 * as a preferred base.  Doing so can result in a larger
 	 * pack file, but the transfer will still take place.
 	 */
+	oe_set_size(entry, size);
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1581,6 +1589,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
+	unsigned long size;
 
 	while (*idx) {
 		struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1593,7 +1602,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
-	oi.sizep = &entry->size;
+	oi.sizep = &size;
 	oi.typep = &type;
 	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
@@ -1603,11 +1612,13 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * and dealt with in prepare_pack().
 		 */
 		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+					       &size);
+		oe_set_size(entry, size);
 	} else {
 		if (type < 0)
 			die("BUG: invalid type %d", type);
 		entry->type = type;
+		oe_set_size(entry, size);
 	}
 }
 
@@ -1748,7 +1759,7 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold < entry->size)
+		if (oe_size_greater_than(entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1775,6 +1786,8 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	unsigned long a_size = oe_size(a);
+	unsigned long b_size = oe_size(b);
 
 	if (a->type > b->type)
 		return -1;
@@ -1788,9 +1801,9 @@ static int type_size_sort(const void *_a, const void *_b)
 		return -1;
 	if (a->preferred_base < b->preferred_base)
 		return 1;
-	if (a->size > b->size)
+	if (a_size > b_size)
 		return -1;
-	if (a->size < b->size)
+	if (a_size < b_size)
 		return 1;
 	return a < b ? -1 : (a > b);  /* newest first */
 }
@@ -1877,7 +1890,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = trg_entry->size;
+	trg_size = oe_size(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1889,7 +1902,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = src_entry->size;
+	src_size = oe_size(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2009,7 +2022,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += n->entry->size;
+		freed_mem += oe_size(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2459,7 +2472,7 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (entry->size < 50)
+		if (oe_size_less_than(entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
diff --git a/pack-objects.h b/pack-objects.h
index cbb39ab568..0253df6cd4 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -71,7 +71,11 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;	/* uncompressed size */
+	/* object uncompressed size _if_ size_valid is true */
+	uint32_t size_;
+
+	/* XXX 4 bytes hole, try to pack */
+
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
@@ -93,6 +97,7 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned size_valid:1;
 
 	/* XXX 8 bits hole, try to pack */
 
@@ -104,7 +109,7 @@ struct object_entry {
 	*/
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 
-	/* size: 88, bit_padding: 2 bits */
+	/* size: 88, bit_padding: 1 bits */
 };
 
 struct packing_data {
@@ -252,4 +257,43 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
+static inline unsigned long oe_size(const struct object_entry *e)
+{
+	if (e->size_valid) {
+		return e->size_;
+	} else {
+		unsigned long size;
+
+		sha1_object_info(e->idx.oid.hash, &size);
+		return size;
+	}
+}
+
+static inline int oe_size_less_than(const struct object_entry *e,
+				    unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ < limit;
+	if (limit > maximum_unsigned_value_of_type(uint32_t))
+		return 1;
+	return oe_size(e) < limit;
+}
+
+static inline int oe_size_greater_than(const struct object_entry *e,
+				       unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ > limit;
+	if (limit <= maximum_unsigned_value_of_type(uint32_t))
+		return 1;
+	return oe_size(e) > limit;
+}
+
+static inline void oe_set_size(struct object_entry *e,
+			       unsigned long size)
+{
+	e->size_ = size;
+	e->size_valid = e->size_ == size;
+}
+
 #endif
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH/RFC v3 12/12] pack-objects: shrink delta_size field in struct object_entry
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (10 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 11/12] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-08 11:42       ` Nguyễn Thái Ngọc Duy
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-08 11:42 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Allowing a delta size of 64 bits is crazy. Shrink this field down to
31 bits with one overflow bit.

If we encounter an existing delta larger than 2GB, we do not cache
delta_size at all and will get the value from oe_size(), potentially
from disk if it's larger than 4GB.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 24 ++++++++++++++----------
 pack-objects.h         | 30 +++++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index db040e95db..0f65e0f243 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,10 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -140,7 +142,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != entry->delta_size)
+	if (!delta_buf || delta_size != DELTA_SIZE(entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -291,14 +293,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1509,7 +1511,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			entry->type = entry->in_pack_type;
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = oe_size(entry);
+			SET_DELTA_SIZE(entry, oe_size(entry));
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1895,7 +1897,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = trg_entry->delta_size;
+		max_size = DELTA_SIZE(trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -1966,10 +1968,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
+	if (delta_size >= maximum_unsigned_value_of_type(uint32_t))
+		return 0;
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == trg_entry->delta_size &&
+		if (delta_size == DELTA_SIZE(trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -1984,7 +1988,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= trg_entry->delta_size;
+		delta_cache_size -= DELTA_SIZE(trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -1997,7 +2001,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	trg_entry->delta_size = delta_size;
+	SET_DELTA_SIZE(trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2120,11 +2124,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, entry->delta_size);
+			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
 			entry->z_delta_size = size;
 			if (entry->z_delta_size == size) {
 				cache_lock();
-				delta_cache_size -= entry->delta_size;
+				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index 0253df6cd4..f1a82bf9ac 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -5,6 +5,7 @@
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
 #define OE_Z_DELTA_BITS		16
+#define OE_DELTA_SIZE_BITS	31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -73,9 +74,6 @@ struct object_entry {
 	struct pack_idx_entry idx;
 	/* object uncompressed size _if_ size_valid is true */
 	uint32_t size_;
-
-	/* XXX 4 bytes hole, try to pack */
-
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
@@ -84,7 +82,10 @@ struct object_entry {
 				     */
 	uint32_t hash;			/* name hint hash */
 	void *delta_data;	/* cached delta (uncompressed) */
-	unsigned long delta_size;	/* delta data size (uncompressed) */
+	/* object uncompressed size _if_ size_valid is true */
+	uint32_t size;
+	uint32_t delta_size_:OE_DELTA_SIZE_BITS;	/* delta data size (uncompressed) */
+	uint32_t delta_size_valid:1;
 	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned type:TYPE_BITS;
@@ -109,7 +110,7 @@ struct object_entry {
 	*/
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 
-	/* size: 88, bit_padding: 1 bits */
+	/* size: 80, bit_padding: 1 bits */
 };
 
 struct packing_data {
@@ -296,4 +297,23 @@ static inline void oe_set_size(struct object_entry *e,
 	e->size_valid = e->size_ == size;
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	if (e->delta_size_valid)
+		return e->delta_size_;
+	return oe_size(e);
+}
+
+static inline void oe_set_delta_size(struct packing_data *pack,
+				     struct object_entry *e,
+				     unsigned long size)
+{
+	e->delta_size_ = size;
+	e->delta_size_valid =e->delta_size_ == size;
+	if (!e->delta_size_valid && size != oe_size(e))
+		die("BUG: this can only happen in check_object() "
+		    "where delta size is the same as entry size");
+}
+
 #endif
-- 
2.16.2.873.g32ff258c87


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry
  2018-03-08 11:42       ` [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-09 22:34         ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-09 22:34 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> The role of this comment block becomes more important after we shuffle
> fields around to shrink this struct. It will be much harder to see what
> field is related to what. This also documents the holes in this struct
> according to pahole.
>
> A couple of notes on shrinking the struct:
>
> 1) The reader may notice one thing from this document and the shrinking
> business. If "delta" is NULL, all other delta-related fields should be
> irrelevant. We could group all these in a separate struct and replace
> them all with a pointer to this struct (allocated separately).
>
> This does not help much though since 85% of objects are deltified
> (source: linux-2.6.git). The gain is only from non-delta objects, which
> is not that significant.

OK.

> 2) The field in_pack_offset and idx.offset could be merged. But we need
> to be very careful. Up until the very last phase (object writing),
> idx.offset is not used and can hold in_pack_offset. Then idx.offset will
> be updated with _destination pack's_ offset, not source's. But since we
> always write delta's bases first, and we only use in_pack_offset in
> writing phase when we reuse objects, we should be ok?

By separating the processing in strict phases, I do think the result
would be OK, but at the same time, that does smell like an
invitation for future bugs.

> +/*
> + * basic object info
> + * -----------------
> + * idx.oid is filled up before delta searching starts. idx.crc32 and
> + * is only valid after the object is written down and will be used for
> + * generating the index. idx.offset will be both gradually set and
> + * used in writing phase (base objects get offset first, then deltas
> + * refer to them)

Here, I'd feel that "written out" somehow would sound more natural
than "written down", but that is perhaps because I've seen it used
elsewhere and I am confusing familiarity with naturalness.  In any
case, if we mean "written to the resulting packdata stream", saying
that to be more explicit is probably a good idea.  We compute crc32
and learn the offset for each object as we write them to the result.

> + * If a delta is cached in memory and is compressed, "delta" points to
> + * the data and z_delta_size contains the compressed size. If it's

Isn't it "delta_data" (as opposed to "delta") that points at the cached
delta data?

> + * uncompressed [1], z_delta_size must be zero. delta_size is always
> + * the uncompressed size and must be valid even if the delta is not
> + * cached. Delta recreation technically only depends on "delta"
> + * pointer, but delta_size is still used to verify it's the same as
> + * before.
> + *
> + * [1] during try_delta phase we don't bother with compressing because
> + * the delta could be quickly replaced with a better one.
> + */
>  struct object_entry {
>  	struct pack_idx_entry idx;
>  	unsigned long size;	/* uncompressed size */
> @@ -28,6 +74,7 @@ struct object_entry {
>  	unsigned tagged:1; /* near the very tip of refs */
>  	unsigned filled:1; /* assigned write-order */
>  
> +	/* XXX 28 bits hole, try to pack */
>  	/*
>  	 * State flags for depth-first search used for analyzing delta cycles.
>  	 *
> @@ -40,6 +87,7 @@ struct object_entry {
>  		DFS_DONE
>  	} dfs_state;
>  	int depth;
> +	/* size: 136, padding: 4 */
>  };
>  
>  struct packing_data {

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields
  2018-03-08 11:42       ` [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-09 22:54         ` Junio C Hamano
  2018-03-12 17:51           ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-09 22:54 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> @@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
>  	entry->depth = 0;
>  
>  	oi.sizep = &entry->size;
> -	oi.typep = &entry->type;
> +	oi.typep = &type;
>  	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
>  		/*
>  		 * We failed to get the info from this pack for some reason;
> @@ -1580,6 +1586,10 @@ static void drop_reused_delta(struct object_entry *entry)
>  		 */
>  		entry->type = sha1_object_info(entry->idx.oid.hash,
>  					       &entry->size);

The comment immediately before this pre-context reads as such:

		/*
		 * We failed to get the info from this pack for some reason;
		 * fall back to sha1_object_info, which may find another copy.
		 * And if that fails, the error will be recorded in entry->type
		 * and dealt with in prepare_pack().
		 */

The rest of the code relies on the ability of entry->type to record
the error by storing an invalid (negative) type; otherwise, it cannot
detect an error where (1) the entry in _this_ pack was corrupt, and
(2) we wished to find another copy of the object elsewhere (which
would overwrite the negative entry->type we assign here), but we
didn't find any.

How should we propagate the error we found here down the control
flow in this new code?

> +	} else {
> +		if (type < 0)
> +			die("BUG: invalid type %d", type);
> +		entry->type = type;

The BUG() on this side is sensible, as packed_object_info()
shouldn't report success when it stored negative result in *oi.typep
anyway.

>  	unsigned char in_pack_header_size;
> +	unsigned type:TYPE_BITS;
> +	unsigned in_pack_type:TYPE_BITS; /* could be delta */


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v3 04/12] pack-objects: use bitfield for object_entry::depth
  2018-03-08 11:42       ` [PATCH/RFC v3 04/12] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-09 23:07         ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-09 23:07 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +	Maximum value is 4095.
>   ...
> +	if (depth > (1 << OE_DEPTH_BITS))
> +		die(_("delta chain depth %d is greater than maximum limit %d"),
> +		    depth, (1 << OE_DEPTH_BITS));
> +

Do I see an off-by-one here?  Ie.

	if ((1 << OE_DEPTH_BITS) <= depth)
		die("depth that is 4096 or deeper is too much");


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v3 07/12] pack-objects: move in_pack out of struct object_entry
  2018-03-08 11:42       ` [PATCH/RFC v3 07/12] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-09 23:21         ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-09 23:21 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
> pack, use an index instead since the number of packs should be
> relatively small.
>
> This limits the number of packs we can handle to 16k. For now if you hit
> 16k pack files limit, pack-objects will simply fail [1].
>
> This technically saves 7 bytes. But we don't see any of that in
> practice due to padding. The saving becomes real when we pack this
> struct tighter later.

Somehow 7 and 16k do not add up.

We use 8 bytes in the original code, and a solution that potentially
saves 7 bytes would use only 1 byte instead of the original 8, which
would allow us to index/identify 1<<8 == 256 packs, but for some reason
we can handle up to 16k.

> [1] The escape hatch is .keep file to limit the non-kept pack files
>     below 16k limit. Then you can go for another pack-objects run to
>     combine another 16k pack files. Repeat until you're satisfied.

;-)

> +static inline unsigned int oe_add_pack(struct packing_data *pack,
> +				       struct packed_git *p)
> +{
> +	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
> +		die(_("too many packs to handle in one go. "
> +		      "Please add .keep files to exclude\n"
> +		      "some pack files and keep the number "
> +		      "of non-kept files below %d."),
> +		    1 << OE_IN_PACK_BITS);

OK.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields
  2018-03-09 22:54         ` Junio C Hamano
@ 2018-03-12 17:51           ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-12 17:51 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: avarab, e, git, peff

On Fri, Mar 09, 2018 at 02:54:53PM -0800, Junio C Hamano wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
> 
> > @@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
> >  	entry->depth = 0;
> >  
> >  	oi.sizep = &entry->size;
> > -	oi.typep = &entry->type;
> > +	oi.typep = &type;
> >  	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
> >  		/*
> >  		 * We failed to get the info from this pack for some reason;
> > @@ -1580,6 +1586,10 @@ static void drop_reused_delta(struct object_entry *entry)
> >  		 */
> >  		entry->type = sha1_object_info(entry->idx.oid.hash,
> >  					       &entry->size);
> 
> The comment immediately before this pre-context reads as such:
> 
> 		/*
> 		 * We failed to get the info from this pack for some reason;
> 		 * fall back to sha1_object_info, which may find another copy.
> 		 * And if that fails, the error will be recorded in entry->type
> 		 * and dealt with in prepare_pack().
> 		 */
> 
> The rest of the code relies on the ability of entry->type to record
> the error by storing an invalid (negative) type; otherwise, it cannot
> detect an error where (1) the entry in _this_ pack was corrupt, and
> (2) we wished to find another copy of the object elsewhere (which
> would overwrite the negative entry->type we assign here), but we
> didn't find any.
> 
> How should we propagate the error we found here down the control
> flow in this new code?

Good catch! I don't have any magic trick to do this, so I'm adding an
extra bit to store type validity. Something like this as a fixup patch
(I'll resend the whole series soon).

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fd217cb51f..f164f1797b 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -265,7 +265,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1067,7 +1068,7 @@ static void create_object_entry(const struct object_id *oid,
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
 	if (type)
-		entry->type = type;
+		oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1433,9 +1434,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1489,7 +1490,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->delta = base_entry;
 			entry->delta_size = entry->size;
 			entry->delta_sibling = base_entry->delta_child;
@@ -1498,7 +1499,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
@@ -1521,7 +1522,7 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
@@ -1584,12 +1585,10 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * And if that fails, the error will be recorded in entry->type
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &entry->size));
 	} else {
-		if (type < 0)
-			die("BUG: invalid type %d", type);
-		entry->type = type;
+		oe_set_type(entry, type);
 	}
 }
 
@@ -1757,10 +1756,12 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1836,7 +1837,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2442,11 +2443,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
diff --git a/pack-objects.h b/pack-objects.h
index 3e5a89569a..90fbbc9394 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -58,8 +58,9 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	unsigned type:TYPE_BITS;
+	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
+	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
@@ -122,4 +123,19 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= 0;
+	e->type_ = (unsigned)type;
+}
+
 #endif

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 4/5] pack-objects: show some progress when counting kept objects
  2018-03-06 10:41     ` [PATCH v2 4/5] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
@ 2018-03-12 18:32       ` Ævar Arnfjörð Bjarmason
  2018-03-16 19:14         ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-12 18:32 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, peff, Junio C Hamano


On Tue, Mar 06 2018, Nguyễn Thái Ngọc Duy jotted:

> We only show progress when there are new objects to be packed. But
> when --keep-pack is specified on the base pack, we will exclude most
> of the objects. This makes 'pack-objects' stay silent for a long time
> while the counting phase is going.
>
> Let's show some progress whenever we visit an object instead. The
> number of packed objects will be shown after if it's not the same as
> the number of visited objects.
>
> Since the meaning of this number has changed, use another word instead
> of "Counting" to hint about the change.

Can you elaborate on how the meaning has changed? With/without this on
linux.git I get:

With:

    Enumerating objects: 5901144, done.
    Getting object details: 100% (5901145/5901145), done.
    Delta compression using up to 8 threads.

Without:

    Counting objects: 5901145, done.
    Delta compression using up to 8 threads.

So now we're seemingly off-by-one but otherwise doing the same thing?

As far as user feedback goes we might as well have said "Reticulating
splines", but I have some bias towards keeping the current "Counting
objects..." phrasing. We ourselves have other docs referring to it that
aren't changed by this patch, and there's
e.g. https://githubengineering.com/counting-objects/ and lots of other
3rd party docs that refer to this.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-07 18:38           ` Junio C Hamano
@ 2018-03-12 18:56             ` Ævar Arnfjörð Bjarmason
  2018-03-12 21:16               ` Junio C Hamano
  2018-03-15 16:48               ` Duy Nguyen
  0 siblings, 2 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-12 18:56 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Duy Nguyen, Eric Wong, Git Mailing List, Jeff King


On Wed, Mar 07 2018, Junio C. Hamano jotted:

> Duy Nguyen <pclouds@gmail.com> writes:
>>> But to those who say "packs larger than this value is too big" via
>>> configuration, keeping only the largest of these above-threshold
>>> packs would look counter-intuitive, wouldn't it, I wonder?
>>
>> I think I'll just clarify this in the document. There may be a use
>> case for keeping multiple large packs, but I don't see it (*). We can
>> deal with it when it comes.
>
> When the project's history grows too much, a large pack that holds
> its first 10 years of stuff, together with another one that holds
> its second 20 years of stuff, may both be larger than the threshold
> and want to be kept.  If we pick only the largest one, we would
> explode the other one and repack together with loose objects.
>
> But realistically, those who would want to control the way in which
> their repository is packed to such a degree are very likely to add
> ".keep" files to these two packfiles themselves, so the above would
> probably not be a concern.  Perhaps we shouldn't do the "automatically
> pick the largest one and exclude from repacking" when there is a
> packfile that is marked with ".keep"?

As someone who expects to use this (although hopefully in slightly
modified form), it's very useful if we can keep the useful semantics in
gc.* config values without needing some external job finding repos and
creating *.keep files to get custom behavior.

E.g. I have the use-case of wanting to set this on servers that I know
are going to be used for cloning some big repos in user's ~ directory
manually, so if I can set something sensible in /etc/gitconfig that's
great, but it sucks a lot more to need to write some cronjob that goes
hunting for repos in those ~ directories and tweaks *.keep files.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-06 10:41     ` [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
  2018-03-06 19:19       ` Junio C Hamano
  2018-03-07 10:48       ` Johannes Schindelin
@ 2018-03-12 19:30       ` Ævar Arnfjörð Bjarmason
  2018-03-15 17:00         ` Duy Nguyen
  2 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-12 19:30 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, peff, Junio C Hamano


On Tue, Mar 06 2018, Nguyễn Thái Ngọc Duy jotted:

> pack-objects could be a big memory hog especially on large repos,
> everybody knows that. The suggestion to stick a .keep file on the
> giant base pack to avoid this problem has also been known for a long time.
>
> Let's do the suggestion automatically instead of waiting for people to
> come to Git mailing list and get the advice. When a certain condition
> is met, "gc --auto" tells "git repack" to keep the base pack around.
> The end result would be two packs instead of one.
>
> On linux-2.6.git, valgrind massif reports 1.6GB heap in "pack all"
> case, and 535MB [1] in "pack all except the base pack" case. We save
> roughly 1GB memory by excluding the base pack.
>
> gc --auto decides to do this based on an estimation of pack-objects
> memory usage, which is quite accurate at least for the heap part, and
> whether that fits in half of system memory (the assumption here is for
> desktop environment where there are many other applications running).
>
> Since the estimation may be inaccurate and that 1/2 threshold is
> really arbitrary, give the user a finer control over this mechanism:
> if the largest pack is larger than gc.bigBasePackThreshold, it's kept.
>
> PS. A big chunk of the remaining 535MB is the result of pack-objects
> running rev-list internally. This will be dealt with when we could run
> rev-list externally. Right now we can't because pack-objects internal
> rev-list does more regarding unreachable objects, which cannot be done
> by "git rev-list".
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Documentation/config.txt |   7 ++
>  Documentation/git-gc.txt |  13 ++++
>  builtin/gc.c             | 153 +++++++++++++++++++++++++++++++++++++--
>  builtin/pack-objects.c   |   2 +-
>  config.mak.uname         |   1 +
>  git-compat-util.h        |   4 +
>  pack-objects.h           |   2 +
>  t/t6500-gc.sh            |  29 ++++++++
>  8 files changed, 204 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index f57e9cf10c..120cf6bac9 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -1549,6 +1549,13 @@ gc.autoDetach::
>  	Make `git gc --auto` return immediately and run in background
>  	if the system supports it. Default is true.
>
> +gc.bigBasePackThreshold::
> +	Make `git gc --auto` only enable `--keep-base-pack` when the
> +	base pack's size is larger than this limit (in bytes).
> +	Defaults to zero, which disables this check and lets
> +	`git gc --auto` determine when to enable `--keep-base-pack`
> +	based on memory usage.
> +

I'm really keen to use this (and would be happy to apply a patch on
top), but want to get your thoughts first, see also my just-sent
87bmftqg1n.fsf@evledraar.gmail.com
(https://public-inbox.org/git/87bmftqg1n.fsf@evledraar.gmail.com/).

The thing I'd like to change is that the underlying --keep-pack= takes a
list of paths (good!), but then I think this patch needlessly
complicates things by talking about "base packs" and having the
implementation limitation that we only ever pass one --keep-pack down to
pack-objects (bad!).

Why don't we instead just have a gc.* variable that you can set to some
size of pack that we'll always implicitly *.keep? That way I can
e.g. clone a 5GB pack and set the limit to 2GB, then keep adding new
content per the rules of gc.autoPackLimit, until I finally create a
larger than 2GB pack, at that point I'll have 5GB, 2GB, and some smaller
packs and loose objects.

We already have pack.packSizeLimit, perhaps we could call this
e.g. gc.keepPacksSize=2GB?

Or is there a use-case for still having the concept of a "base" pack? Is
it magic in some way? Maybe I'm missing something but I don't see why,
we can just stop thinking about whether some one pack is larger than X,
and consider all packs larger than X specially.

But if we do maybe an extra gc.keepBasePack=true?

Finally I wonder if there should be something equivalent to
gc.autoPackLimit for this. I.e. with my proposed semantics above it's
possible that we end up growing forever, i.e. I could have 1000 2GB
packs and then 50 very small packs per gc.autoPackLimit.

Maybe we need a gc.keepPackLimit=100 to deal with that, then e.g. if
gc.keepPacksSize=2GB is set and we have 101 >= 2GB packs, we'd pick the
two smallest one and not issue a --keep-pack for those, although then
maybe our memory use would spike past the limit.

I don't know, maybe we can leave that for later, but I'm quite keen to
turn the top-level config variable into something that just considers
size instead of "base" if possible, and it seems we're >95% of the way
to that already with this patch.

Finally, I don't like the way the current implementation conflates a
"size" variable with auto detecting the size from memory, leaving no way
to fallback to the auto-detection if you set it manually.

I think we should split out the auto-memory behavior into another
variable, and also make the currently hardcoded 50% of memory
configurable.

That way you could e.g. say you'd always like to keep 2GB packs, but if
you happen to have ended up with a 1GB pack and it's time to repack, and
you only have 500MB free memory on that system, it would keep the 1GB
one until such time as we have more memory.

Actually maybe that should be a "if we're that low on memory, forget
about GC for now" config, but urgh, there's a lot of potential
complexity to be handled here...

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-12 18:56             ` Ævar Arnfjörð Bjarmason
@ 2018-03-12 21:16               ` Junio C Hamano
  2018-03-12 22:01                 ` Ævar Arnfjörð Bjarmason
  2018-03-15 16:48               ` Duy Nguyen
  1 sibling, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-12 21:16 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Duy Nguyen, Eric Wong, Git Mailing List, Jeff King

On Mon, Mar 12, 2018 at 11:56 AM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> As someone who expects to use this (although hopefully in slightly
> modified form), it's very useful if we can keep the useful semantics in
> gc.* config values without needing some external job finding repos and
> creating *.keep files to get custom behavior.
>
> E.g. I have the use-case of wanting to set this on servers that I know
> are going to be used for cloning some big repos in user's ~ directory
> manually, so if I can set something sensible in /etc/gitconfig that's
> great, but it sucks a lot more to need to write some cronjob that goes
> hunting for repos in those ~ directories and tweaks *.keep files.

Yeah, but that is exactly what I suggested, no? That is, if you don't do any
specific marking to describe _which_ ones need to be kept, this new thing
would kick in and pick the largest one and repack all others. If you choose
to want more control, on the other hand, you can mark those packs you
would want to keep, and this mechanism will not kick in to countermand
your explicit settings done via those .keep files.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-12 21:16               ` Junio C Hamano
@ 2018-03-12 22:01                 ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-12 22:01 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Duy Nguyen, Eric Wong, Git Mailing List, Jeff King


On Mon, Mar 12 2018, Junio C. Hamano jotted:

> On Mon, Mar 12, 2018 at 11:56 AM, Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>> As someone who expects to use this (although hopefully in slightly
>> modified form), it's very useful if we can keep the useful semantics in
>> gc.* config values without needing some external job finding repos and
>> creating *.keep files to get custom behavior.
>>
>> E.g. I have the use-case of wanting to set this on servers that I know
>> are going to be used for cloning some big repos in user's ~ directory
>> manually, so if I can set something sensible in /etc/gitconfig that's
>> great, but it sucks a lot more to need to write some cronjob that goes
>> hunting for repos in those ~ directories and tweaks *.keep files.
>
> Yeah, but that is exactly what I suggested, no? That is, if you don't do any
> specific marking to describe _which_ ones need to be kept, this new thing
> would kick in and pick the largest one and repack all others. If you choose
> to want more control, on the other hand, you can mark those packs you
> would want to keep, and this mechanism will not kick in to countermand
> your explicit settings done via those .keep files.

Yes, this configurable mechanism as it stands only needs /etc/gitconfig.

What I was pointing out in this mail is that we really should get the
advanced use-cases right as well (see my
87a7vdqegi.fsf@evledraar.gmail.com for details) via the config, because
it's a pain to cross the chasm between setting config centrally on the
one hand, and needing to track down .git's in arbitrary locations on the
FS (you may not have cloned them yourself) to set *.keep flags.

Doubly so if the machines in question are just the laptops of some
developers. It's relatively easy to tell them "we work with git repos,
run these git config commands", not so easy to have them install & keep
up-to-date some arbitrary cronjob that needs to hunt down their repos
and set *.keep flags.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH/RFC v3 08/12] pack-objects: refer to delta objects by index instead of pointer
  2018-03-08 11:42       ` [PATCH/RFC v3 08/12] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-14 16:18         ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-14 16:18 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Notice that packing_data::nr_objects is uint32_t, we could only handle
> maximum 4G objects and can address all of them with an uint32_t. If we
> use a pointer here, we waste 4 bytes on 64 bit architecture.

Some things are left unsaid or left unclear and make readers stutter
a bit while reading this paragraph.  We can address them with
uint32_t only because we happen to have a linear array of all
objects involved already, i.e. the pack->objects[] array.  The
readers are forced to rephrase the above in their mind

	... and each of them can be identified with an uint32_t.
	Because we have all of these objects in pack->objects[], we
	can replace the "delta" field in each object entry that
	points at its delta base object with uint32_t index into
	this array to save memory (on 64-bit arch, 8-byte pointer
	gets shrunk to 4-byte uint).

or something like that before understanding why this is a valid
memory footprint optimization.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-12 18:56             ` Ævar Arnfjörð Bjarmason
  2018-03-12 21:16               ` Junio C Hamano
@ 2018-03-15 16:48               ` Duy Nguyen
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-15 16:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Junio C Hamano, Eric Wong, Git Mailing List, Jeff King

On Mon, Mar 12, 2018 at 7:56 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Wed, Mar 07 2018, Junio C. Hamano jotted:
>
>> Duy Nguyen <pclouds@gmail.com> writes:
>>>> But to those who say "packs larger than this value is too big" via
>>>> configuration, keeping only the largest of these above-threshold
>>>> packs would look counter-intuitive, wouldn't it, I wonder?
>>>
>>> I think I'll just clarify this in the document. There may be a use
>>> case for keeping multiple large packs, but I don't see it (*). We can
>>> deal with it when it comes.
>>
>> When the project's history grows too much, a large pack that holds
>> its first 10 years of stuff, together with another one that holds
>> its second 20 years of stuff, may both be larger than the threshold
>> and want to be kept.  If we pick only the largest one, we would
>> explode the other one and repack together with loose objects.
>>
>> But realistically, those who would want to control the way in which
>> their repository is packed to such a degree are very likely to add
>> ".keep" files to these two packfiles themselves, so the above would
>> probably not be a concern.  Perhaps we shouldn't do the "automatically
>> pick the largest one and exclude from repacking" when there is a
>> packfile that is marked with ".keep"?
>
> As someone who expects to use this (although hopefully in slightly
> modified form), it's very useful if we can keep the useful semantics in
> gc.* config values without needing some external job finding repos and
> creating *.keep files to get custom behavior.
>
> E.g. I have the use-case of wanting to set this on servers that I know
> are going to be used for cloning some big repos in user's ~ directory
> manually, so if I can set something sensible in /etc/gitconfig that's
> great, but it sucks a lot more to need to write some cronjob that goes
> hunting for repos in those ~ directories and tweaks *.keep files.

If this is about gc.bigBasePackThreshold keeping all packs larger
than the threshold, then yes it will be so in the reroll.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-12 19:30       ` Ævar Arnfjörð Bjarmason
@ 2018-03-15 17:00         ` Duy Nguyen
  2018-03-15 19:21           ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-15 17:00 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Jeff King, Junio C Hamano

On Mon, Mar 12, 2018 at 8:30 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> We already have pack.packSizeLimit, perhaps we could call this
> e.g. gc.keepPacksSize=2GB?

I'm OK either way. The "base pack" concept comes from the
"--keep-base-pack" option where we do keep _one_ base pack. But gc
config var has a slightly different semantics when it can keep
multiple packs.

> Finally I wonder if there should be something equivalent to
> gc.autoPackLimit for this. I.e. with my proposed semantics above it's
> possible that we end up growing forever, i.e. I could have 1000 2GB
> packs and then 50 very small packs per gc.autoPackLimit.
>
> Maybe we need a gc.keepPackLimit=100 to deal with that, then e.g. if
> gc.keepPacksSize=2GB is set and we have 101 >= 2GB packs, we'd pick the
> two smallest one and not issue a --keep-pack for those, although then
> maybe our memory use would spike past the limit.
>
> I don't know, maybe we can leave that for later, but I'm quite keen to
> turn the top-level config variable into something that just considers
> size instead of "base" if possible, and it seems we're >95% of the way
> to that already with this patch.

At least I will try to ignore gc.keepPacksSize if all packs are kept
because of it. That repack run will hurt. But then we're back to one
giant pack and plenty of small packs that will take some time to grow
up to 2GB again.

> Finally, I don't like the way the current implementation conflates a
> "size" variable with auto detecting the size from memory, leaving no way
> to fallback to the auto-detection if you set it manually.
>
> I think we should split out the auto-memory behavior into another
> variable, and also make the currently hardcoded 50% of memory
> configurable.
>
> That way you could e.g. say you'd always like to keep 2GB packs, but if
> you happen to have ended up with a 1GB pack and it's time to repack, and
> you only have 500MB free memory on that system, it would keep the 1GB
> one until such time as we have more memory.

I don't calculate based on free memory (it's tricky to get that right
on linux) but physical memory. If you don't have enough memory
according to this formula, you won't until you add more memory sticks.

>
> Actually maybe that should be a "if we're that low on memory, forget
> about GC for now" config, but urgh, there's a lot of potential
> complexity to be handled here...

Yeah I think what you want is a hook. You can make custom rules then.
We already have pre-auto-gc hook and could pretty much do what you
want without pack-objects memory estimation. But if you want it, maybe
we can export the info to the hook somehow.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-15 17:00         ` Duy Nguyen
@ 2018-03-15 19:21           ` Ævar Arnfjörð Bjarmason
  2018-03-16 17:47             ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-15 19:21 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Eric Wong, Git Mailing List, Jeff King, Junio C Hamano


On Thu, Mar 15 2018, Duy Nguyen jotted:

> On Mon, Mar 12, 2018 at 8:30 PM, Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>> We already have pack.packSizeLimit, perhaps we could call this
>> e.g. gc.keepPacksSize=2GB?
>
> I'm OK either way. The "base pack" concept comes from the
> "--keep-base-pack" option where we do keep _one_ base pack. But gc
> config var has a slightly different semantics when it can keep
> multiple packs.

I see, yeah it would be great to generalize it to N packs.

>> Finally I wonder if there should be something equivalent to
>> gc.autoPackLimit for this. I.e. with my proposed semantics above it's
>> possible that we end up growing forever, i.e. I could have 1000 2GB
>> packs and then 50 very small packs per gc.autoPackLimit.
>>
>> Maybe we need a gc.keepPackLimit=100 to deal with that, then e.g. if
>> gc.keepPacksSize=2GB is set and we have 101 >= 2GB packs, we'd pick the
>> two smallest one and not issue a --keep-pack for those, although then
>> maybe our memory use would spike past the limit.
>>
>> I don't know, maybe we can leave that for later, but I'm quite keen to
>> turn the top-level config variable into something that just considers
>> size instead of "base" if possible, and it seems we're >95% of the way
>> to that already with this patch.
>
> At least I will try to ignore gc.keepPacksSize if all packs are kept
> because of it. That repack run will hurt. But then we're back to one
> giant pack and plenty of small packs that will take some time to grow
> up to 2GB again.

I think that semantic really should have its own option. The usefulness
of this is significantly diminished if it's not a guarantee on the
resource use of git-gc.

Consider a very large repo where we clone and get a 4GB pack. Then as
time goes on we end up with lots of loose objects and small packs from
pulling, and eventually end up with say 4GB + 2x 500MB packs (if our
limit is 500MB).

If I understand what you're saying correctly if we ever match the gc
--auto requirements because we have *just* the big packs and then a
bunch of loose objects (say we rebased a lot) then we'll try to create a
giant 5GB pack (+ loose objects).

>> Finally, I don't like the way the current implementation conflates a
>> "size" variable with auto detecting the size from memory, leaving no way
>> to fallback to the auto-detection if you set it manually.
>>
>> I think we should split out the auto-memory behavior into another
>> variable, and also make the currently hardcoded 50% of memory
>> configurable.
>>
>> That way you could e.g. say you'd always like to keep 2GB packs, but if
>> you happen to have ended up with a 1GB pack and it's time to repack, and
>> you only have 500MB free memory on that system, it would keep the 1GB
>> one until such time as we have more memory.
>
> I don't calculate based on free memory (it's tricky to get that right
> on linux) but physical memory. If you don't have enough memory
> according to this formula, you won't until you add more memory sticks.

Ah, thanks for the clarification.

>>
>> Actually maybe that should be a "if we're that low on memory, forget
>> about GC for now" config, but urgh, there's a lot of potential
>> complexity to be handled here...
>
> Yeah I think what you want is a hook. You can make custom rules then.
> We already have pre-auto-gc hook and could pretty much do what you
> want without pack-objects memory estimation. But if you want it, maybe
> we can export the info to the hook somehow.

I can do away with that particular thing, but I'd really like to do
without the hook. I can automate it on some machines, but then we also
have un-managed laptops run by users who clone big repos. It's much
easier to tell them to set a few git config variables than have them
install & keep some hook up-to-date.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-15 19:21           ` Ævar Arnfjörð Bjarmason
@ 2018-03-16 17:47             ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-16 17:47 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Jeff King, Junio C Hamano

On Thu, Mar 15, 2018 at 8:21 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Thu, Mar 15 2018, Duy Nguyen jotted:
>
>> On Mon, Mar 12, 2018 at 8:30 PM, Ævar Arnfjörð Bjarmason
>> <avarab@gmail.com> wrote:
>>> We already have pack.packSizeLimit, perhaps we could call this
>>> e.g. gc.keepPacksSize=2GB?
>>
>> I'm OK either way. The "base pack" concept comes from the
>> "--keep-base-pack" option where we do keep _one_ base pack. But gc
>> config var has a slightly different semantics when it can keep
>> multiple packs.
>
> I see, yeah it would be great to generalize it to N packs.
>
>>> Finally I wonder if there should be something equivalent to
>>> gc.autoPackLimit for this. I.e. with my proposed semantics above it's
>>> possible that we end up growing forever, i.e. I could have 1000 2GB
>>> packs and then 50 very small packs per gc.autoPackLimit.
>>>
>>> Maybe we need a gc.keepPackLimit=100 to deal with that, then e.g. if
>>> gc.keepPacksSize=2GB is set and we have 101 >= 2GB packs, we'd pick the
>>> two smallest one and not issue a --keep-pack for those, although then
>>> maybe our memory use would spike past the limit.
>>>
>>> I don't know, maybe we can leave that for later, but I'm quite keen to
>>> turn the top-level config variable into something that just considers
>>> size instead of "base" if possible, and it seems we're >95% of the way
>>> to that already with this patch.
>>
>> At least I will try to ignore gc.keepPacksSize if all packs are kept
>> because of it. That repack run will hurt. But then we're back to one
>> giant pack and plenty of small packs that will take some time to grow
>> up to 2GB again.
>
> I think that semantic really should have its own option. The usefulness
> of this is significantly diminished if it's not a guarantee on the
> resource use of git-gc.
>
> Consider a very large repo where we clone and get a 4GB pack. Then as
> time goes on we end up with lots of loose objects and small packs from
> pulling, and eventually end up with say 4GB + 2x 500MB packs (if our
> limit is 500MB).
>
> If I understand what you're saying correctly if we ever match the gc
> --auto requirements because we have *just* the big packs and then a
> bunch of loose objects (say we rebased a lot) then we'll try to create a
> giant 5GB pack (+ loose objects).

Yes. There isn't a simple and easy solution here and I consider
packing (even if it's expensive) to regain performance is better than
not packing at all. I could tweak that a bit by keeping the largest
pack out (so we have two packs in the end). After a long long long time
when your second pack gets to 5 GB, then we hit the most expensive
repack. But that should be ok for now, I guess.

I think this repack strategy was discussed here at some point in the
past by Gerrit guys. Their goal was to reduce I/O, I believe. A
perfect solution probably could be found, but I don't want to hold
this series back until it's found and I don't want to introduce a
zillion config knobs that become useless later on when the perfect
solution is found.

>>> Actually maybe that should be a "if we're that low on memory, forget
>>> about GC for now" config, but urgh, there's a lot of potential
>>> complexity to be handled here...
>>
>> Yeah I think what you want is a hook. You can make custom rules then.
>> We already have pre-auto-gc hook and could pretty much do what you
>> want without pack-objects memory estimation. But if you want it, maybe
>> we can export the info to the hook somehow.
>
> I can do away with that particular thing, but I'd really like to do
> without the hook. I can automate it on some machines, but then we also
> have un-managed laptops run by users who clone big repos. It's much
> easier to tell them to set a few git config variables than have them
> install & keep some hook up-to-date.

That sounds like we need a mechanism to push hooks (and config stuff)
automatically from clone source. I think this topic was touched in the
summit? I don't object to adding new config but we need to figure out
what we need, and from this thread I think there are too many "I don't
know" to settle on a solution.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v4 00/11] nd/pack-objects-pack-struct updates
  2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
                         ` (11 preceding siblings ...)
  2018-03-08 11:42       ` [PATCH/RFC v3 12/12] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31       ` Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
                           ` (11 more replies)
  12 siblings, 12 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The most important change in v4 is it fixes a case where I failed to
propagate an error condition to later code in 02/11. This results in
new wrappers, oe_type() and oe_set_type(). This also reveals another
extra object type, OBJ_NONE, that's also used by pack-objects.

Other changes are comments fixes, commit messages fixes, off-by-one
bugs. No more saving compared to v3.

I also changed my approach a bit. I stop trying to make struct
reduction visible at every patch. All these patches shrink some field
even if the struct size is the same. The reordering and packing
happens at the last patch.

I'm not super happy that many corner cases of my changes are not
covered by the test suite. In many cases it's very hard or expensive
to create the right error condition. If only this code is part of
libgit.a and I could write C unit tests for it...

Interdiff

-- 8< --
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 0f65e0f243..c388d87c3e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -275,7 +275,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    oe_size_greater_than(entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -381,7 +381,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -491,11 +491,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
@@ -716,8 +717,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -726,7 +727,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1083,8 +1084,7 @@ static void create_object_entry(const struct object_id *oid,
 
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
-	if (type)
-		entry->type = type;
+	oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1453,9 +1453,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1509,7 +1509,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
 			SET_DELTA_SIZE(entry, oe_size(entry));
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
@@ -1518,7 +1518,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			unsigned long size;
 
 			size = get_size_from_delta(p, &w_curs,
@@ -1544,14 +1544,15 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
 	 * as a preferred base.  Doing so can result in a larger
 	 * pack file, but the transfer will still take place.
 	 */
-	oe_set_size(entry, size);
+	if (entry->type_valid)
+		oe_set_size(entry, size);
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1613,15 +1614,12 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * And if that fails, the error will be recorded in entry->type
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &size);
-		oe_set_size(entry, size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &size));
 	} else {
-		if (type < 0)
-			die("BUG: invalid type %d", type);
-		entry->type = type;
-		oe_set_size(entry, size);
+		oe_set_type(entry, type);
 	}
+	oe_set_size(entry, size);
 }
 
 /*
@@ -1788,12 +1786,14 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 	unsigned long a_size = oe_size(a);
 	unsigned long b_size = oe_size(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1869,7 +1869,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2484,11 +2484,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
@@ -2597,7 +2597,7 @@ static void read_object_list_from_stdin(void)
 			die("expected object ID, got garbage:\n %s", line);
 
 		add_preferred_base_object(p + 1);
-		add_object_entry(&oid, 0, p + 1, 0);
+		add_object_entry(&oid, OBJ_NONE, p + 1, 0);
 	}
 }
 
@@ -3110,7 +3110,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
-	if (depth > (1 << OE_DEPTH_BITS))
+	if (depth >= (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
 		    depth, (1 << OE_DEPTH_BITS));
 	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index 256a63f892..f7c897515b 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -66,12 +66,12 @@ void bitmap_writer_build_type_index(struct packing_data *to_pack,
 
 		oe_set_in_pack_pos(to_pack, entry, i);
 
-		switch (entry->type) {
+		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
 		case OBJ_TREE:
 		case OBJ_BLOB:
 		case OBJ_TAG:
-			real_type = entry->type;
+			real_type = oe_type(entry);
 			break;
 
 		default:
@@ -100,7 +100,7 @@ void bitmap_writer_build_type_index(struct packing_data *to_pack,
 		default:
 			die("Missing type information for %s (%d/%d)",
 			    oid_to_hex(&entry->idx.oid), real_type,
-			    entry->type);
+			    oe_type(entry));
 		}
 	}
 }
diff --git a/pack-objects.h b/pack-objects.h
index 1a159aba37..0fa0c83294 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -28,7 +28,7 @@ enum dfs_state {
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 and
- * is only valid after the object is written down and will be used for
+ * is only valid after the object is written out and will be used for
  * generating the index. idx.offset will be both gradually set and
  * used in writing phase (base objects get offset first, then deltas
  * refer to them)
@@ -59,8 +59,8 @@ enum dfs_state {
  * compute_write_order(). "delta" and "delta_size" must remain valid
  * at object writing phase in case the delta is not cached.
  *
- * If a delta is cached in memory and is compressed delta points to
- * the data and z_delta_size contains the compressed size. If it's
+ * If a delta is cached in memory and is compressed delta_data points
+ * to the data and z_delta_size contains the compressed size. If it's
  * uncompressed [1], z_delta_size must be zero. delta_size is always
  * the uncompressed size and must be valid even if the delta is not
  * cached.
@@ -70,23 +70,22 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	/* object uncompressed size _if_ size_valid is true */
-	uint32_t size_;
+	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
+	uint32_t hash;			/* name hint hash */
+	uint32_t size_;	/* object uncompressed size _if_ size_valid is true */
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	uint32_t hash;			/* name hint hash */
-	void *delta_data;	/* cached delta (uncompressed) */
-	/* object uncompressed size _if_ size_valid is true */
-	uint32_t size;
-	uint32_t delta_size_:OE_DELTA_SIZE_BITS;	/* delta data size (uncompressed) */
+	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
-	unsigned char in_pack_header_size; /* note: spare bits available! */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
-	unsigned type:TYPE_BITS;
+	unsigned size_valid:1;
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
+	unsigned type_valid:1;
+	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -94,21 +93,13 @@ struct object_entry {
 				    * objects against.
 				    */
 	unsigned no_try_delta:1;
+	unsigned char in_pack_header_size;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
-	unsigned size_valid:1;
-
-	/* XXX 8 bits hole, try to pack */
-
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
-	/*
-	 * if delta_data contains a compressed delta, this contains
-	 * the compressed length
-	*/
-	unsigned z_delta_size:OE_Z_DELTA_BITS;
 
-	/* size: 80, bit_padding: 1 bits */
+	/* size: 80, bit_padding: 16 bits */
 };
 
 struct packing_data {
@@ -151,6 +142,21 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= OBJ_NONE;
+	e->type_ = (unsigned)type;
+}
+
 static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
 					  const struct object_entry *e)
 {
-- 8< --

Nguyễn Thái Ngọc Duy (11):
  pack-objects: a bit of document about struct object_entry
  pack-objects: turn type and in_pack_type to bitfields
  pack-objects: use bitfield for object_entry::dfs_state
  pack-objects: use bitfield for object_entry::depth
  pack-objects: move in_pack_pos out of struct object_entry
  pack-objects: move in_pack out of struct object_entry
  pack-objects: refer to delta objects by index instead of pointer
  pack-objects: shrink z_delta_size field in struct object_entry
  pack-objects: shrink size field in struct object_entry
  pack-objects: shrink delta_size field in struct object_entry
  pack-objects.h: reorder members to shrink struct object_entry

 Documentation/config.txt           |   4 +-
 Documentation/git-pack-objects.txt |  13 +-
 Documentation/git-repack.txt       |   4 +-
 builtin/pack-objects.c             | 309 +++++++++++++++++------------
 cache.h                            |   3 +
 object.h                           |   1 -
 pack-bitmap-write.c                |  14 +-
 pack-bitmap.c                      |   2 +-
 pack-bitmap.h                      |   4 +-
 pack-objects.h                     | 294 ++++++++++++++++++++++++---
 10 files changed, 488 insertions(+), 160 deletions(-)

-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 20:32           ` Junio C Hamano
  2018-03-16 18:31         ` [PATCH v4 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                           ` (10 subsequent siblings)
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The role of this comment block becomes more important after we shuffle
fields around to shrink this struct. It will be much harder to see what
field is related to what.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..85345a4af1 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,50 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 and
+ * is only valid after the object is written out and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size is not
+ * cached (ie. raw data in a pack) but available via revindex.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset, in_pack_header_size) tuple contains
+ * the location of the object in the source pack, with or without
+ * header.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed delta_data points
+ * to the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 02/11] pack-objects: turn type and in_pack_type to bitfields
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 20:49           ` Junio C Hamano
  2018-03-16 18:31         ` [PATCH v4 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                           ` (9 subsequent siblings)
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

An extra field type_valid is added to carry the equivalent of OBJ_BAD
in the original "type" field. in_pack_type always contains a valid
type so we only need 3 bits for it.

A note about accepting OBJ_NONE as "valid" type. The function
read_object_list_from_stdin() can pass this value [1] and it
eventually calls create_object_entry() where current code skips setting
"type" field if the incoming type is zero. This does not have any bad
side effects because "type" field should be memset()'d anyway.

But since we also need to set type_valid now, skipping oe_set_type()
leaves type_valid zero/false, which will make oe_type() return
OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in
prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger

    fatal: unable to get type of object ...

Accepting OBJ_NONE [2] does sound wrong, but this is how it has
been for a very long time and I haven't had time to dig in further.

[1] See 5c49c11686 (pack-objects: better check_object() performances -
    2007-04-16)

[2] 21666f1aae (convert object type handling from a string to a number
    - 2007-02-26)

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 58 +++++++++++++++++++++++++-----------------
 cache.h                |  2 ++
 object.h               |  1 -
 pack-bitmap-write.c    |  6 ++---
 pack-objects.h         | 20 +++++++++++++--
 5 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..13f6a44fb2 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -265,7 +265,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1066,8 +1067,7 @@ static void create_object_entry(const struct object_id *oid,
 
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
-	if (type)
-		entry->type = type;
+	oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1428,9 +1433,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1484,7 +1489,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->delta = base_entry;
 			entry->delta_size = entry->size;
 			entry->delta_sibling = base_entry->delta_child;
@@ -1493,7 +1498,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
@@ -1516,7 +1521,7 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
@@ -1578,8 +1584,10 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * And if that fails, the error will be recorded in entry->type
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &entry->size));
+	} else {
+		oe_set_type(entry, type);
 	}
 }
 
@@ -1747,10 +1755,12 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1826,7 +1836,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2432,11 +2442,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
@@ -2545,7 +2555,7 @@ static void read_object_list_from_stdin(void)
 			die("expected object ID, got garbage:\n %s", line);
 
 		add_preferred_base_object(p + 1);
-		add_object_entry(&oid, 0, p + 1, 0);
+		add_object_entry(&oid, OBJ_NONE, p + 1, 0);
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..fd11f08940 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -64,12 +64,12 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 
 		entry->in_pack_pos = i;
 
-		switch (entry->type) {
+		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
 		case OBJ_TREE:
 		case OBJ_BLOB:
 		case OBJ_TAG:
-			real_type = entry->type;
+			real_type = oe_type(entry);
 			break;
 
 		default:
@@ -98,7 +98,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 		default:
 			die("Missing type information for %s (%d/%d)",
 			    oid_to_hex(&entry->idx.oid), real_type,
-			    entry->type);
+			    oe_type(entry));
 		}
 	}
 }
diff --git a/pack-objects.h b/pack-objects.h
index 85345a4af1..38d3ff167f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -58,8 +58,9 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
+	unsigned type_:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
+	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
@@ -122,4 +123,19 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= OBJ_NONE;
+	e->type_ = (unsigned)type;
+}
+
 #endif
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 03/11] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                           ` (8 subsequent siblings)
  11 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 28 +++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 13f6a44fb2..09f8b4ef3e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index 38d3ff167f..2bb1732098 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS	2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 /*
  * basic object info
  * -----------------
@@ -72,19 +87,10 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
 
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
+
 };
 
 struct packing_data {
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 04/11] pack-objects: use bitfield for object_entry::depth
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (2 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                           ` (7 subsequent siblings)
  11 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Because of struct packing, from now on we can only handle a max depth
of 4095 (or even lower when new booleans are added in this struct).
This should be ok since long delta chains will cause significant
slowdown anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 4 ++++
 pack-objects.h                     | 5 ++---
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 09f8b4ef3e..668eaf8cd7 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth >= (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS));
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 2bb1732098..50908d1f2d 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -88,9 +89,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-
-	int depth;
-
+	unsigned depth:OE_DEPTH_BITS;
 };
 
 struct packing_data {
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 05/11] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (3 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 18:31         ` [PATCH v4 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                           ` (6 subsequent siblings)
  11 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (it's not freed in the same way that objects[] is
not).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 ++-
 pack-bitmap-write.c    |  8 +++++---
 pack-bitmap.c          |  2 +-
 pack-bitmap.h          |  4 +++-
 pack-objects.h         | 16 +++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 668eaf8cd7..b281487b96 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -879,7 +879,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index fd11f08940..f7c897515b 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index 50908d1f2d..dae160e7c2 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -78,7 +78,6 @@ struct object_entry {
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -98,6 +97,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -143,4 +144,17 @@ static inline void oe_set_type(struct object_entry *e,
 	e->type_ = (unsigned)type;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
 #endif
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 06/11] pack-objects: move in_pack out of struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (4 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-26 20:39           ` Stefan Beller
  2018-03-16 18:31         ` [PATCH v4 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                           ` (5 subsequent siblings)
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
pack, use an index, since the number of packs should be relatively
small.

This limits the number of packs we can handle to 16k. For now if you hit
the 16k pack file limit, pack-objects will simply fail [1].

[1] The escape hatch is .keep file to limit the non-kept pack files
    below 16k limit. Then you can go for another pack-objects run to
    combine another 16k pack files. Repeat until you're satisfied.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 ++++++
 builtin/pack-objects.c             | 40 +++++++++++++++++----------
 cache.h                            |  1 +
 pack-objects.h                     | 44 +++++++++++++++++++++++++++++-
 4 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 3503c9e3e6..b8d936ccf5 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -269,6 +269,15 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
+LIMITATIONS
+-----------
+
+This command could only handle 16384 existing pack files at a time.
+If you have more than this, you need to exclude some pack files with
+".keep" file and --honor-pack-keep option, to combine 16k pack files
+in one, then remove these .keep files and run pack-objects one more
+time.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b281487b96..ca993e55dd 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,8 @@
 #include "list.h"
 #include "packfile.h"
 
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (oe_type(entry) == OBJ_REF_DELTA ||
 		 oe_type(entry) == OBJ_OFS_DELTA)
@@ -1025,7 +1027,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			return want;
+			goto done;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1048,11 +1050,16 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				return want;
+				goto done;
 		}
 	}
 
-	return 1;
+	want = 1;
+done:
+	if (want && *found_pack && !(*found_pack)->index)
+		oe_add_pack(&to_pack, *found_pack);
+
+	return want;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1074,7 +1081,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		oe_set_in_pack(entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1406,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1542,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1587,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1857,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -3191,6 +3200,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
+	/* make sure IN_PACK(0) return NULL */
+	oe_add_pack(&to_pack, NULL);
+
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.h b/pack-objects.h
index dae160e7c2..9bcb5946e5 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,6 +3,7 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		14
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -18,6 +19,10 @@ enum dfs_state {
 };
 
 /*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ *
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 and
@@ -64,7 +69,7 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -99,6 +104,8 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	int in_pack_count;
+	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -157,4 +164,39 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
+static inline unsigned int oe_add_pack(struct packing_data *pack,
+				       struct packed_git *p)
+{
+	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
+		die(_("too many packs to handle in one go. "
+		      "Please add .keep files to exclude\n"
+		      "some pack files and keep the number "
+		      "of non-kept files below %d."),
+		    1 << OE_IN_PACK_BITS);
+	if (p) {
+		if (p->index > 0)
+			die("BUG: this packed is already indexed");
+		p->index = pack->in_pack_count;
+	}
+	pack->in_pack[pack->in_pack_count] = p;
+	return pack->in_pack_count++;
+}
+
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	return pack->in_pack[e->in_pack_idx];
+
+}
+
+static inline void oe_set_in_pack(struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (p->index <= 0)
+		die("BUG: found_pack should be NULL "
+		    "instead of having non-positive index");
+	e->in_pack_idx = p->index;
+
+}
+
 #endif
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 07/11] pack-objects: refer to delta objects by index instead of pointer
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (5 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 20:59           ` Junio C Hamano
  2018-03-16 18:31         ` [PATCH v4 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
                           ` (4 subsequent siblings)
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

These delta pointers always point to elements in the objects[] array
in the packing_data struct. We can hold at most 2^32-1 of those objects
because the array length, nr_objects, is uint32_t. We could use
uint32_t indexes to address these elements instead of pointers. On
64-bit architectures (8 bytes per pointer) this would save 4 bytes per
pointer.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
 pack-objects.h         |  67 ++++++++++++++++++++++--
 2 files changed, 124 insertions(+), 59 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ca993e55dd..cdbad57082 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -127,11 +133,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -288,12 +294,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -317,7 +323,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -343,7 +349,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,8 +385,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -408,7 +414,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -427,7 +433,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -467,13 +473,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -489,7 +495,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -541,12 +547,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -608,34 +614,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -646,7 +652,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -661,8 +667,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -672,11 +678,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1498,10 +1504,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1572,17 +1578,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1622,7 +1630,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1647,7 +1655,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1670,7 +1678,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1685,7 +1693,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1870,7 +1878,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1946,7 +1954,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1975,7 +1983,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1984,13 +1992,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2059,7 +2067,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2109,7 +2117,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2117,7 +2125,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2438,7 +2446,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index 9bcb5946e5..7f32de2a35 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -71,11 +71,11 @@ struct object_entry {
 	unsigned long size;	/* uncompressed size */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -199,4 +199,61 @@ static inline void oe_set_in_pack(struct object_entry *e,
 
 }
 
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 08/11] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (6 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 19:40           ` Junio C Hamano
  2018-03-16 18:31         ` [PATCH v4 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
                           ` (3 subsequent siblings)
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only cache deltas when they are smaller than a certain limit. This
limit defaults to 1000, but we save the compressed length in a 64-bit
field. Shrink that field down to 16 bits, so only deltas whose
compressed size is below 64KiB (65536 bytes) can be cached.
Larger deltas must be recomputed when the pack is written out.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  3 ++-
 builtin/pack-objects.c   | 22 ++++++++++++++++------
 pack-objects.h           |  3 ++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9bd3f5a789..00fa824448 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2449,7 +2449,8 @@ pack.deltaCacheLimit::
 	The maximum size of a delta, that is cached in
 	linkgit:git-pack-objects[1]. This cache is used to speed up the
 	writing object phase by not having to recompute the final delta
-	result once the best match for all objects is found. Defaults to 1000.
+	result once the best match for all objects is found.
+	Defaults to 1000. Maximum value is 65535.
 
 pack.threads::
 	Specifies the number of threads to spawn when searching for best
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index cdbad57082..9a0962cf31 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2105,12 +2105,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * between writes at that moment.
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
-			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
-			cache_lock();
-			delta_cache_size -= entry->delta_size;
-			delta_cache_size += entry->z_delta_size;
-			cache_unlock();
+			unsigned long size;
+
+			size = do_compress(&entry->delta_data, entry->delta_size);
+			entry->z_delta_size = size;
+			if (entry->z_delta_size == size) {
+				cache_lock();
+				delta_cache_size -= entry->delta_size;
+				delta_cache_size += entry->z_delta_size;
+				cache_unlock();
+			} else {
+				FREE_AND_NULL(entry->delta_data);
+				entry->z_delta_size = 0;
+			}
 		}
 
 		/* if we made n a delta, and if n is already at max
@@ -3089,6 +3096,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (depth >= (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
 		    depth, (1 << OE_DEPTH_BITS));
+	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
+		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
+		    1 << OE_Z_DELTA_BITS);
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 7f32de2a35..a66c37e35a 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,7 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
+#define OE_Z_DELTA_BITS		16
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -78,7 +79,7 @@ struct object_entry {
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
-	unsigned long z_delta_size;	/* delta data size (compressed) */
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (7 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 19:49           ` Junio C Hamano
  2018-03-16 18:31         ` [PATCH v4 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
                           ` (2 subsequent siblings)
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

It's very very rare that an uncompressed object is larger than 4GB
(partly because Git does not handle those large files very well to
begin with). Let's optimize it for the common case where object size
is smaller than this limit.

Shrink size field down to 32 bits [1] and one overflow bit. If the size
is too large, we read it back from disk.

Add two compare helpers that can take advantage of the overflow
bit (e.g. if the file is 4GB+, chances are it's already larger than
core.bigFileThreshold and there's no point in comparing the actual
value).

A small note about the conditional oe_set_size() in
check_object(). Technically if we don't get a valid type, it's not
wrong if we set uninitialized value "size" (we don't pre-initialize
this and sha1_object_info will not assign anything when it fails to
get the info).

This, however, changes the writing code path slightly, which emits
different error messages (either way we die). One of our tests in t5530
depends on this specific error message. Let's just keep the test as-is
and play safe by not assigning a random value. That might trigger
valgrind anyway.

[1] it's actually already 32 bits on Windows

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 49 ++++++++++++++++++++++++++----------------
 pack-objects.h         | 43 +++++++++++++++++++++++++++++++++++-
 2 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 9a0962cf31..14aa4acd50 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -274,7 +274,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (oe_type(entry) == OBJ_BLOB &&
-		    entry->size > big_file_threshold &&
+		    oe_size_greater_than(entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -384,12 +384,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
+	unsigned long entry_size = oe_size(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
-					      type, entry->size);
+					      type, entry_size);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -406,7 +407,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	datalen -= entry->in_pack_header_size;
 
 	if (!pack_to_stdout && p->index_version == 1 &&
-	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
 		error("corrupt packed object for %s",
 		      oid_to_hex(&entry->idx.oid));
 		unuse_pack(&w_curs);
@@ -1412,6 +1413,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
+	unsigned long size;
+
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
@@ -1431,13 +1434,14 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &entry->size);
+						   &size);
 		if (used == 0)
 			goto give_up;
 
 		if (type < 0)
 			die("BUG: invalid type %d", type);
 		entry->in_pack_type = type;
+		oe_set_size(entry, size);
 
 		/*
 		 * Determine if this is a delta and if so whether we can
@@ -1505,7 +1509,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = entry->size;
+			entry->delta_size = oe_size(entry);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1513,14 +1517,17 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (oe_type(entry)) {
+			unsigned long size;
+
+			size = get_size_from_delta(p, &w_curs,
+				entry->in_pack_offset + entry->in_pack_header_size);
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
-			if (entry->size == 0)
+			oe_set_size(entry, size);
+			if (oe_size_less_than(entry, 1))
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1535,13 +1542,15 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
 	 * as a preferred base.  Doing so can result in a larger
 	 * pack file, but the transfer will still take place.
 	 */
+	if (entry->type_valid)
+		oe_set_size(entry, size);
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1581,6 +1590,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
+	unsigned long size;
 
 	while (*idx) {
 		struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1593,7 +1603,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
-	oi.sizep = &entry->size;
+	oi.sizep = &size;
 	oi.typep = &type;
 	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
@@ -1603,10 +1613,11 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * and dealt with in prepare_pack().
 		 */
 		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
-						    &entry->size));
+						    &size));
 	} else {
 		oe_set_type(entry, type);
 	}
+	oe_set_size(entry, size);
 }
 
 /*
@@ -1746,7 +1757,7 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold < entry->size)
+		if (oe_size_greater_than(entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1775,6 +1786,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
+	unsigned long a_size = oe_size(a);
+	unsigned long b_size = oe_size(b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1788,9 +1801,9 @@ static int type_size_sort(const void *_a, const void *_b)
 		return -1;
 	if (a->preferred_base < b->preferred_base)
 		return 1;
-	if (a->size > b->size)
+	if (a_size > b_size)
 		return -1;
-	if (a->size < b->size)
+	if (a_size < b_size)
 		return 1;
 	return a < b ? -1 : (a > b);  /* newest first */
 }
@@ -1877,7 +1890,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = trg_entry->size;
+	trg_size = oe_size(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1889,7 +1902,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = src_entry->size;
+	src_size = oe_size(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2009,7 +2022,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += n->entry->size;
+		freed_mem += oe_size(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2459,7 +2472,7 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (entry->size < 50)
+		if (oe_size_less_than(entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
diff --git a/pack-objects.h b/pack-objects.h
index a66c37e35a..5c7e15ca92 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -69,7 +69,9 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;	/* uncompressed size */
+	/* object uncompressed size _if_ size_valid is true */
+	uint32_t size_;
+	unsigned size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
@@ -257,4 +259,43 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
+static inline unsigned long oe_size(const struct object_entry *e)
+{
+	if (e->size_valid) {
+		return e->size_;
+	} else {
+		unsigned long size;
+
+		sha1_object_info(e->idx.oid.hash, &size);
+		return size;
+	}
+}
+
+static inline int oe_size_less_than(const struct object_entry *e,
+				    unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ < limit;
+	if (limit > maximum_unsigned_value_of_type(uint32_t))
+		return 1;
+	return oe_size(e) < limit;
+}
+
+static inline int oe_size_greater_than(const struct object_entry *e,
+				       unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ > limit;
+	if (limit <= maximum_unsigned_value_of_type(uint32_t))
+		return 1;
+	return oe_size(e) > limit;
+}
+
+static inline void oe_set_size(struct object_entry *e,
+			       unsigned long size)
+{
+	e->size_ = size;
+	e->size_valid = e->size_ == size;
+}
+
 #endif
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 10/11] pack-objects: shrink delta_size field in struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (8 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:31         ` Nguyễn Thái Ngọc Duy
  2018-03-16 18:32         ` [PATCH v4 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  11 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:31 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Allowing a delta size of 64 bits is crazy. Shrink this field down to
31 bits with one overflow bit.

If we find an existing delta larger than 2GB, we do not cache
delta_size at all and will get the value from oe_size(), potentially
from disk if it's larger than 4GB.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 24 ++++++++++++++----------
 pack-objects.h         | 23 ++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 14aa4acd50..c388d87c3e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,10 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -140,7 +142,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != entry->delta_size)
+	if (!delta_buf || delta_size != DELTA_SIZE(entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -291,14 +293,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1509,7 +1511,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = oe_size(entry);
+			SET_DELTA_SIZE(entry, oe_size(entry));
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1895,7 +1897,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = trg_entry->delta_size;
+		max_size = DELTA_SIZE(trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -1966,10 +1968,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
+	if (delta_size >= maximum_unsigned_value_of_type(uint32_t))
+		return 0;
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == trg_entry->delta_size &&
+		if (delta_size == DELTA_SIZE(trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -1984,7 +1988,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= trg_entry->delta_size;
+		delta_cache_size -= DELTA_SIZE(trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -1997,7 +2001,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	trg_entry->delta_size = delta_size;
+	SET_DELTA_SIZE(trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2120,11 +2124,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, entry->delta_size);
+			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
 			entry->z_delta_size = size;
 			if (entry->z_delta_size == size) {
 				cache_lock();
-				delta_cache_size -= entry->delta_size;
+				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index 5c7e15ca92..f430d938c6 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -5,6 +5,7 @@
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
 #define OE_Z_DELTA_BITS		16
+#define OE_DELTA_SIZE_BITS	31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -80,7 +81,8 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
-	unsigned long delta_size;	/* delta data size (uncompressed) */
+	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	uint32_t delta_size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -298,4 +300,23 @@ static inline void oe_set_size(struct object_entry *e,
 	e->size_valid = e->size_ == size;
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	if (e->delta_size_valid)
+		return e->delta_size_;
+	return oe_size(e);
+}
+
+static inline void oe_set_delta_size(struct packing_data *pack,
+				     struct object_entry *e,
+				     unsigned long size)
+{
+	e->delta_size_ = size;
+	e->delta_size_valid =e->delta_size_ == size;
+	if (!e->delta_size_valid && size != oe_size(e))
+		die("BUG: this can only happen in check_object() "
+		    "where delta size is the same as entry size");
+}
+
 #endif
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 11/11] pack-objects.h: reorder members to shrink struct object_entry
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (9 preceding siblings ...)
  2018-03-16 18:31         ` [PATCH v4 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-16 18:32         ` Nguyễn Thái Ngọc Duy
  2018-03-16 21:02           ` Junio C Hamano
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  11 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 18:32 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Previous patches leave lots of holes and padding in this struct. This
patch reorders the members and shrinks the struct down to 80 bytes
(from 136 bytes, before any field shrinking is done) with 16 bits to
spare (and a couple more in in_pack_header_size when we really run out
of bits).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index f430d938c6..0fa0c83294 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -70,35 +70,36 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	/* object uncompressed size _if_ size_valid is true */
-	uint32_t size_;
-	unsigned size_valid:1;
-	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
+	uint32_t hash;			/* name hint hash */
+	uint32_t size_;	/* object uncompressed size _if_ size_valid is true */
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	void *delta_data;	/* cached delta (uncompressed) */
 	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	unsigned size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
+	unsigned type_valid:1;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
-	unsigned type_valid:1;
-	uint32_t hash;			/* name hint hash */
-	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
 				    * objects against.
 				    */
 	unsigned no_try_delta:1;
+	unsigned char in_pack_header_size;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
+
+	/* size: 80, bit_padding: 16 bits */
 };
 
 struct packing_data {
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 4/5] pack-objects: show some progress when counting kept objects
  2018-03-12 18:32       ` Ævar Arnfjörð Bjarmason
@ 2018-03-16 19:14         ` Duy Nguyen
  2018-03-16 20:13           ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-16 19:14 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Jeff King, Junio C Hamano

On Mon, Mar 12, 2018 at 7:32 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Tue, Mar 06 2018, Nguyễn Thái Ngọc Duy jotted:
>
>> We only show progress when there are new objects to be packed. But
>> when --keep-pack is specified on the base pack, we will exclude most
>> of objects. This makes 'pack-objects' stay silent for a long time
>> while the counting phase is going.
>>
>> Let's show some progress whenever we visit an object instead. The
>> number of packed objects will be shown after if it's not the same as
>> the number of visited objects.
>>
>> Since the meaning of this number has changed, use another word instead
>> of "Counting" to hint about the change.
>
> Can you elaborate on how the meaning has changed? With/without this on
> linux.git I get:
>
> With:
>
>     Enumerating objects: 5901144, done.
>     Getting object details: 100% (5901145/5901145), done.
>     Delta compression using up to 8 threads.
>
> Without:
>
>     Counting objects: 5901145, done.
>     Delta compression using up to 8 threads.
>
> So now we're seemingly off-by-one but otherwise doing the same thing?

Yep, it's an off-by-one bug.

> As far as user feedback goes we might as well have said "Reticulating
> splines", but I have some bias towards keeping the current "Counting
> objects..." phrasing. We ourselves have other docs referring to it that
> aren't changed by this patch, and there's
> e.g. https://githubengineering.com/counting-objects/ and lots of other
> 3rd party docs that refer to this.

This is why I changed the phrase. The counting is now a bit different.
Documents describing this exact phrase won't apply to the new version.

The old way counts objects that will be packed. The new way simply
counts objects that are visited. When you keep some packs, the number
of objects you visit but do not pack could be very high, while in the
normal case the two numbers should be the same (i.e. you pack
everything you visit). I would prefer to print both values (e.g.
"counting objects: <packed>/<visited>") but that's not possible with
the current progress code.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v3 0/7] nd/repack-keep-pack updates
  2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
                       ` (5 preceding siblings ...)
  2018-03-06 17:49     ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Junio C Hamano
@ 2018-03-16 19:27     ` Nguyễn Thái Ngọc Duy
  2018-03-16 19:27       ` [PATCH v3 1/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
                         ` (7 more replies)
  6 siblings, 8 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This is not in a mergeable state yet. But since there are lots of
changes and I'm pretty sure there's still room for improvement when it
comes to configuration and stuff, I'm posting it anyway to keep the
discussion going.

- --keep-pack does not imply --honor-pack-keep or --pack-kept-objects
  anymore. If you want those, pass them in explicitly. If not, only
  --keep-pack'd packs are kept.

- v2 03/05 feels too big to me so it's now broken down into a couple
  of patches.

- FreeBSD support is added and tested (which probably means OS X
  support is good too). The Windows code for getting memory size
  remains untested.

- platforms that do not have support for getting memory will not have
  this "keep base pack on gc --auto" turned on. This may be just safer
  than printing "unrecognized platform" which may not reach platform
  developers anyway.

- gc.bigBasePackThreshold is renamed to gc.bigPackThreshold and now
  keeps all packs larger than this limit.

- fix the off-by-one in "enumerating objects" code.

- while I tend to agree that the "50% physical memory" limit should be
  configurable, I'm not sure if I should add a config var for that
  (which leads to more questions like, should we allow saying "30%
  _free_ memory" as well?) or if I should just give the memory
  estimation to the user and let them decide what to do with it via
  hooks.

Interdiff

-- 8< --
diff --git a/Documentation/config.txt b/Documentation/config.txt
index 120cf6bac9..ce40112e31 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1549,12 +1549,16 @@ gc.autoDetach::
 	Make `git gc --auto` return immediately and run in background
 	if the system supports it. Default is true.
 
-gc.bigBasePackThreshold::
-	Make `git gc --auto` only enable `--keep-base-pack` when the
-	base pack's size is larger than this limit (in bytes).
-	Defaults to zero, which disables this check and lets
-	`git gc --auto` determine when to enable `--keep-base-pack`
-	based on memory usage.
+gc.bigPackThreshold::
+	If non-zero, all packs larger than this limit are kept when
+	`git gc` is run. This is very similar to `--keep-base-pack`
+	except that all packs that meet the threshold are kept, not
+	just the base pack. Defaults to zero.
++
+Note that if the number of kept packs is more than gc.autoPackLimit,
+this configuration variable is ignored, all packs except the base pack
+will be repacked. After this the number of packs should go below
+gc.autoPackLimit and gc.bigPackThreshold should be respected again.
 
 gc.logExpiry::
 	If the file gc.log exists, then `git gc --auto` won't run
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 35ad420d5c..19b0d1741b 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -9,7 +9,7 @@ git-gc - Cleanup unnecessary files and optimize the local repository
 SYNOPSIS
 --------
 [verse]
-'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force]
+'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force] [--keep-base-pack]
 
 DESCRIPTION
 -----------
@@ -55,15 +55,16 @@ all loose objects are combined into a single pack using
 disables automatic packing of loose objects.
 +
 If the number of packs exceeds the value of `gc.autoPackLimit`,
-then existing packs (except those marked with a `.keep` file)
+then existing packs (except those marked with a `.keep` file
+or over `gc.bigPackThreshold` limit)
 are consolidated into a single pack by using the `-A` option of
-'git repack'. Setting `gc.autoPackLimit` to 0 disables
-automatic consolidation of packs.
-+
-If the physical amount of memory is considered not enough for `git
-repack` to run smoothly, `--keep-base-pack` is enabled. This could be
-overridden by setting `gc.bigBasePackThreshold` which only enables
-`--keep-base-pack` when the base pack is larger the specified limit.
+'git repack'.
+If the amount of memory is estimated not enough for `git repack` to
+run smoothly and `gc.bigPackThreshold` is not set, the largest
+pack will also be excluded (this is the equivalent of running `git gc`
+with `--keep-base-pack`).
+Setting `gc.autoPackLimit` to 0 disables automatic consolidation of
+packs.
 
 --prune=<date>::
 	Prune loose objects older than date (default is 2 weeks ago,
@@ -84,8 +85,10 @@ overridden by setting `gc.bigBasePackThreshold` which only enables
 	instance running on this repository.
 
 --keep-base-pack::
-	All packs except the base pack are consolidated into a single
-	pack. The largest pack is considered the base pack.
+	All packs except the base pack and those marked with a `.keep`
+	file are consolidated into a single pack. The largest pack is
+	considered the base pack. When this option is used,
+	`gc.bigPackThreshold` is ignored.
 
 Configuration
 -------------
@@ -176,10 +179,6 @@ run commands concurrently have to live with some risk of corruption (which
 seems to be low in practice) unless they turn off automatic garbage
 collection with 'git config gc.auto 0'.
 
-Set environment variable `GIT_TRACE` in order to see the memory usage
-estimation in `git gc --auto` that determines whether the base pack is
-kept.
-
 HOOKS
 -----
 
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 1975477160..403524652a 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -12,7 +12,7 @@ SYNOPSIS
 'git pack-objects' [-q | --progress | --all-progress] [--all-progress-implied]
 	[--no-reuse-delta] [--delta-base-offset] [--non-empty]
 	[--local] [--incremental] [--window=<n>] [--depth=<n>]
-	[--revs [--unpacked | --all]]
+	[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
 	[--stdout [--filter=<filter-spec>] | base-name]
 	[--shallow] [--keep-true-parents] < object-list
 
@@ -126,9 +126,12 @@ base-name::
 	has a .keep file to be ignored, even if it would have
 	otherwise been packed.
 
---keep-pack=<pack name>::
-	Ignore the given pack. This is the equivalent of having
-	`.keep` file on the pack. Implies `--honor-pack-keep`.
+--keep-pack=<pack-name>::
+	This flag causes an object already in the given pack to be
+	ignored, even if it would have otherwise been
+	packed. `<pack-name>` is the pack file name without
+	leading directory (e.g. `pack-123.pack`). The option could be
+	specified multiple times to keep multiple packs.
 
 --incremental::
 	This flag causes an object already in a pack to be ignored
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index 12b073e115..ce497d9d12 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -9,7 +9,7 @@ git-repack - Pack unpacked objects in a repository
 SYNOPSIS
 --------
 [verse]
-'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=<n>] [--depth=<n>] [--threads=<n>]
+'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
 
 DESCRIPTION
 -----------
@@ -133,9 +133,12 @@ other objects in that pack they already have locally.
 	with `-b` or `repack.writeBitmaps`, as it ensures that the
 	bitmapped packfile has the necessary objects.
 
---keep-pack=<pack name>::
+--keep-pack=<pack-name>::
 	Exclude the given pack from repacking. This is the equivalent
-	of having `.keep` file on the pack. Implies `--pack-kept-objects`.
+	of having `.keep` file on the pack. `<pack-name>` is the
+	pack file name without leading directory (e.g. `pack-123.pack`).
+	The option could be specified multiple times to keep multiple
+	packs.
 
 --unpack-unreachable=<when>::
 	When loosening unreachable objects, do not bother loosening any
diff --git a/builtin/gc.c b/builtin/gc.c
index ff914264a5..140c1bb7dd 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -43,7 +43,7 @@ static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
-static unsigned long big_base_pack_threshold;
+static unsigned long big_pack_threshold;
 static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
@@ -132,7 +132,7 @@ static void gc_config(void)
 	git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire);
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
-	git_config_get_ulong("gc.bigbasepackthreshold", &big_base_pack_threshold);
+	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
 	git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size);
 
 	git_config(git_default_config, NULL);
@@ -173,18 +173,27 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static struct packed_git *find_the_base_pack(void)
+static struct packed_git *find_base_packs(struct string_list *packs,
+					  unsigned long limit)
 {
 	struct packed_git *p, *base = NULL;
 
 	prepare_packed_git();
 
 	for (p = packed_git; p; p = p->next) {
-		if (p->pack_local &&
-		    (!base || base->pack_size < p->pack_size))
+		if (!p->pack_local)
+			continue;
+		if (limit) {
+			if (p->pack_size >= limit)
+				string_list_append(packs, p->pack_name);
+		} else if (!base || base->pack_size < p->pack_size) {
 			base = p;
+		}
 	}
 
+	if (base)
+		string_list_append(packs, base->pack_name);
+
 	return base;
 }
 
@@ -211,21 +220,24 @@ static int too_many_packs(void)
 	return gc_auto_pack_limit < cnt;
 }
 
-static inline unsigned long total_ram(void)
+static uint64_t total_ram(void)
 {
-	unsigned long default_ram = 4;
-#ifdef HAVE_SYSINFO
+#if defined(HAVE_SYSINFO)
 	struct sysinfo si;
 
 	if (!sysinfo(&si))
 		return si.totalram;
-#elif defined(HAVE_BSD_SYSCTL) && defined(HW_MEMSIZE)
+#elif defined(HAVE_BSD_SYSCTL) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM))
 	int64_t physical_memory;
 	int mib[2];
 	size_t length;
 
 	mib[0] = CTL_HW;
+# if defined(HW_MEMSIZE)
 	mib[1] = HW_MEMSIZE;
+# else
+	mib[1] = HW_PHYSMEM;
+# endif
 	length = sizeof(int64_t);
 	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
 		return physical_memory;
@@ -235,24 +247,18 @@ static inline unsigned long total_ram(void)
 	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
 	if (GlobalMemoryStatusEx(&memInfo))
 		return memInfo.ullTotalPhys;
-#else
-	fprintf(stderr, _("unrecognized platform, assuming %lu GB RAM\n"),
-		default_ram);
 #endif
-	return default_ram * 1024 * 1024 * 1024;
+	return 0;
 }
 
-static int pack_objects_uses_too_much_memory(struct packed_git *pack)
+static uint64_t estimate_repack_memory(struct packed_git *pack)
 {
 	unsigned long nr_objects = approximate_object_count();
-	size_t mem_want, mem_have, os_cache, heap;
+	size_t os_cache, heap;
 
 	if (!pack || !nr_objects)
 		return 0;
 
-	if (big_base_pack_threshold)
-		return pack->pack_size >= big_base_pack_threshold;
-
 	/*
 	 * First we have to scan through at least one pack.
 	 * Assume enough room in OS file cache to keep the entire pack
@@ -284,28 +290,16 @@ static int pack_objects_uses_too_much_memory(struct packed_git *pack)
 	/* and of course pack-objects has its own delta cache */
 	heap += max_delta_cache_size;
 
-	/*
-	 * Only allow 1/2 of memory for pack-objects, leave the rest
-	 * for the OS and other processes in the system.
-	 */
-	mem_have = total_ram() / 2;
-	mem_want = os_cache + heap;
-
-	trace_printf("gc mem estimation\n"
-		     "mem_have: %" PRIuMAX ", mem_want: %" PRIuMAX ", "
-		     "heap: %" PRIuMAX "\n"
-		     "pack_size: %" PRIuMAX ", index_size: %" PRIuMAX ", "
-		     "nr_objects: %" PRIuMAX "\n"
-		     "base_cache: %" PRIuMAX ", delta_cache: %" PRIuMAX "\n",
-		     (uintmax_t)mem_have, (uintmax_t)mem_want, (uintmax_t)heap,
-		     (uintmax_t)pack->pack_size, (uintmax_t)pack->index_size,
-		     (uintmax_t)nr_objects,
-		     (uintmax_t)delta_base_cache_limit, (uintmax_t)max_delta_cache_size);
-
-	return mem_want >= mem_have;
+	return os_cache + heap;
 }
 
-static void add_repack_all_option(struct packed_git *keep_pack)
+static int keep_one_pack(struct string_list_item *item, void *data)
+{
+	argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string));
+	return 0;
+}
+
+static void add_repack_all_option(struct string_list *keep_pack)
 {
 	if (prune_expire && !strcmp(prune_expire, "now"))
 		argv_array_push(&repack, "-a");
@@ -316,8 +310,7 @@ static void add_repack_all_option(struct packed_git *keep_pack)
 	}
 
 	if (keep_pack)
-		argv_array_pushf(&repack, "--keep-pack=%s",
-				 basename(keep_pack->pack_name));
+		for_each_string_list(keep_pack, keep_one_pack, NULL);
 }
 
 static void add_repack_incremental_option(void)
@@ -341,12 +334,33 @@ static int need_to_gc(void)
 	 * there is no need.
 	 */
 	if (too_many_packs()) {
-		struct packed_git *exclude = find_the_base_pack();
+		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
+
+		if (big_pack_threshold) {
+			find_base_packs(&keep_pack, big_pack_threshold);
+			if (keep_pack.nr >= gc_auto_pack_limit) {
+				big_pack_threshold = 0;
+				string_list_clear(&keep_pack, 0);
+				find_base_packs(&keep_pack, 0);
+			}
+		} else {
+			struct packed_git * p = find_base_packs(&keep_pack, 0);
+			uint64_t mem_have, mem_want;
+
+			mem_have = total_ram();
+			mem_want = estimate_repack_memory(p);
 
-		if (!pack_objects_uses_too_much_memory(exclude))
-			exclude = NULL;
+			/*
+			 * Only allow 1/2 of memory for pack-objects, leave
+			 * the rest for the OS and other processes in the
+			 * system.
+			 */
+			if (!mem_have || mem_want < mem_have / 2)
+				string_list_clear(&keep_pack, 0);
+		}
 
-		add_repack_all_option(exclude);
+		add_repack_all_option(&keep_pack);
+		string_list_clear(&keep_pack, 0);
 	} else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
@@ -558,17 +572,17 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 			daemonized = !daemonize();
 		}
 	} else {
-		struct packed_git *base_pack = find_the_base_pack();
-		struct packed_git *exclude = NULL;
+		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
 
 		if (keep_base_pack != -1) {
 			if (keep_base_pack)
-				exclude = base_pack;
-		} else if (base_pack && big_base_pack_threshold &&
-			   base_pack->pack_size >= big_base_pack_threshold)
-			exclude = base_pack;
+				find_base_packs(&keep_pack, 0);
+		} else if (big_pack_threshold) {
+			find_base_packs(&keep_pack, big_pack_threshold);
+		}
 
-		add_repack_all_option(exclude);
+		add_repack_all_option(&keep_pack);
+		string_list_clear(&keep_pack, 0);
 	}
 
 	name = lock_repo_for_gc(force, &pid);
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 2ec911bf10..ac8f29dd52 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -54,7 +54,7 @@ static int pack_loose_unreachable;
 static int local;
 static int have_non_local_packs;
 static int incremental;
-static int ignore_packed_keep;
+static int ignore_packed_keep, ignore_packed_keep_in_core;
 static int allow_ofs_delta;
 static struct pack_idx_option pack_idx_opts;
 static const char *base_name;
@@ -983,13 +983,15 @@ static int want_found_object(int exclude, struct packed_git *p)
 	 * Otherwise, we signal "-1" at the end to tell the caller that we do
 	 * not know either way, and it needs to check more packs.
 	 */
-	if (!ignore_packed_keep &&
+	if (!ignore_packed_keep && !ignore_packed_keep_in_core &&
 	    (!local || !have_non_local_packs))
 		return 1;
 
 	if (local && !p->pack_local)
 		return 0;
-	if (ignore_packed_keep && p->pack_local && p->pack_keep)
+	if (p->pack_local &&
+	    ((ignore_packed_keep && p->pack_keep) ||
+	     (ignore_packed_keep_in_core && p->pack_keep_in_core)))
 		return 0;
 
 	/* we don't know yet; keep looking for more packs */
@@ -1092,7 +1094,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	off_t found_offset = 0;
 	uint32_t index_pos;
 
-	display_progress(progress_state, nr_seen++);
+	display_progress(progress_state, ++nr_seen);
 
 	if (have_duplicate_entry(oid, exclude, &index_pos))
 		return 0;
@@ -1119,7 +1121,7 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 {
 	uint32_t index_pos;
 
-	display_progress(progress_state, nr_seen++);
+	display_progress(progress_state, ++nr_seen);
 
 	if (have_duplicate_entry(oid, 0, &index_pos))
 		return 0;
@@ -2684,7 +2686,7 @@ static void add_objects_in_unpacked_packs(struct rev_info *revs)
 		struct object_id oid;
 		struct object *o;
 
-		if (!p->pack_local || p->pack_keep)
+		if (!p->pack_local || p->pack_keep || p->pack_keep_in_core)
 			continue;
 		if (open_pack_index(p))
 			die("cannot open pack index");
@@ -2746,7 +2748,8 @@ static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid)
 	p = (last_found != (void *)1) ? last_found : packed_git;
 
 	while (p) {
-		if ((!p->pack_local || p->pack_keep) &&
+		if ((!p->pack_local || p->pack_keep ||
+				p->pack_keep_in_core) &&
 			find_pack_entry_one(oid->hash, p)) {
 			last_found = p;
 			return 1;
@@ -2789,7 +2792,7 @@ static void loosen_unused_packed_objects(struct rev_info *revs)
 	struct object_id oid;
 
 	for (p = packed_git; p; p = p->next) {
-		if (!p->pack_local || p->pack_keep)
+		if (!p->pack_local || p->pack_keep || p->pack_keep_in_core)
 			continue;
 
 		if (open_pack_index(p))
@@ -2816,6 +2819,7 @@ static int pack_options_allow_reuse(void)
 	return pack_to_stdout &&
 	       allow_ofs_delta &&
 	       !ignore_packed_keep &&
+	       !ignore_packed_keep_in_core &&
 	       (!local || !have_non_local_packs) &&
 	       !incremental;
 }
@@ -2939,13 +2943,14 @@ static void add_extra_kept_packs(const struct string_list *names)
 		if (!p->pack_local)
 			continue;
 
-		for (i = 0; i < names->nr; i++) {
-			if (fspathcmp(name, names->items[i].string))
-				continue;
+		for (i = 0; i < names->nr; i++)
+			if (!fspathcmp(name, names->items[i].string))
+				break;
 
-			p->pack_keep = 1;
-			ignore_packed_keep = 1;
-			break;
+		if (i < names->nr) {
+			p->pack_keep_in_core = 1;
+			ignore_packed_keep_in_core = 1;
+			continue;
 		}
 	}
 }
diff --git a/builtin/repack.c b/builtin/repack.c
index 6a1dade0e1..6c636e159e 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -100,7 +100,7 @@ static void get_non_kept_pack_filenames(struct string_list *fname_list,
 		size_t len;
 		int i;
 
-		for (i = 0;i < extra_keep->nr; i++)
+		for (i = 0; i < extra_keep->nr; i++)
 			if (!fspathcmp(e->d_name, extra_keep->items[i].string))
 				break;
 		if (extra_keep->nr > 0 && i < extra_keep->nr)
@@ -227,8 +227,6 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	    (unpack_unreachable || (pack_everything & LOOSEN_UNREACHABLE)))
 		die(_("--keep-unreachable and -A are incompatible"));
 
-	if (keep_pack_list.nr && pack_kept_objects > 0)
-		die(_("incompatible --keep-pack and --pack-kept-objects"));
 	if (pack_kept_objects < 0)
 		pack_kept_objects = write_bitmaps;
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..42d700f3d8 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int pack_fd;
 	unsigned pack_local:1,
 		 pack_keep:1,
+		 pack_keep_in_core:1,
 		 freshened:1,
 		 do_not_close:1,
 		 pack_promisor:1;
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 863fdbb0fd..96ca70f9cc 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -7,9 +7,9 @@ test_description='basic git gc tests
 
 test_expect_success 'setup' '
 	# do not let the amount of physical memory affects gc
-	# behavior, make sure the pack_objects_uses_too_much_memory()
-	# always returns false
-	git config gc.bigBasePackThreshold 2g
+	# behavior, make sure we always pack everything to one pack by
+	# default
+	git config gc.bigPackThreshold 2g
 '
 
 test_expect_success 'gc empty repository' '
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 553d907d34..05ae0de3aa 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -200,16 +200,22 @@ test_expect_success 'repack --keep-pack' '
 	test_create_repo keep-pack &&
 	(
 		cd keep-pack &&
-		for cmit in one two three four; do
-			test_commit $cmit &&
-			git repack -d
-		done &&
+		test_commit one &&
+		git repack -d &&
+		test_commit two &&
+		git repack -d &&
+		test_commit three &&
+		git repack -d &&
+		test_commit four &&
+		git repack -d &&
 		( cd .git/objects/pack && ls *.pack ) >pack-list &&
 		test_line_count = 4 pack-list &&
-		KEEP1=`head -n1 pack-list` &&
-		KEEP4=`tail -n1 pack-list` &&
+		KEEP1=$(head -n1 pack-list) &&
+		KEEP4=$(tail -n1 pack-list) &&
 		git repack -a -d --keep-pack $KEEP1 --keep-pack $KEEP4 &&
 		ls .git/objects/pack/*.pack >new-counts &&
+		grep -q $KEEP1 new-counts &&
+		grep -q $KEEP4 new-counts &&
 		test_line_count = 3 new-counts &&
 		git fsck
 	)
-- 8< --

Nguyễn Thái Ngọc Duy (7):
  repack: add --keep-pack option
  gc: add --keep-base-pack
  gc: detect base packs based on gc.bigPackThreshold config
  gc --auto: exclude base pack if not enough mem to "repack -ad"
  gc: handle a corner case in gc.bigPackThreshold
  pack-objects: show some progress when counting kept objects
  pack-objects: display progress in get_object_details()

 Documentation/config.txt           |  11 ++
 Documentation/git-gc.txt           |  20 +++-
 Documentation/git-pack-objects.txt |   9 +-
 Documentation/git-repack.txt       |   9 +-
 builtin/gc.c                       | 167 +++++++++++++++++++++++++++--
 builtin/pack-objects.c             |  68 +++++++++---
 builtin/repack.c                   |  21 +++-
 cache.h                            |   1 +
 config.mak.uname                   |   1 +
 git-compat-util.h                  |   4 +
 pack-objects.h                     |   2 +
 t/t6500-gc.sh                      |  29 +++++
 t/t7700-repack.sh                  |  25 +++++
 13 files changed, 339 insertions(+), 28 deletions(-)

-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 1/7] repack: add --keep-pack option
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-16 19:27       ` [PATCH v3 2/7] gc: add --keep-base-pack Nguyễn Thái Ngọc Duy
                         ` (6 subsequent siblings)
  7 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We allow keeping existing packs by having companion .keep files. This
is helpful when a pack is permanently kept. In the next patch, git-gc
just wants to keep a pack temporarily, for one pack-objects
run. git-gc can use --keep-pack for this use case.

A note about why the pack_keep field cannot be reused and
pack_keep_in_core has to be added: this concerns the case when
--keep-pack is specified together with either --keep-unreachable or
--unpack-unreachable, but --honor-pack-keep is NOT specified.

In this case, we want to exclude objects from the packs specified on
command line, not from ones with .keep files. If only one bit flag is
used, we have to clear pack_keep on pack files with the .keep file.

But we can't make any assumption about unreachable objects in .keep
packs. If the "pack_keep" field is false for .keep packs, we could
potentially pull lots of unreachable objects into the new pack, or
unpack them loose. The safer approach is to ignore all packs with either
a .keep file or --keep-pack.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 +++++-
 Documentation/git-repack.txt       |  9 +++++-
 builtin/pack-objects.c             | 48 ++++++++++++++++++++++++++----
 builtin/repack.c                   | 21 +++++++++++--
 cache.h                            |  1 +
 t/t7700-repack.sh                  | 25 ++++++++++++++++
 6 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..403524652a 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -12,7 +12,7 @@ SYNOPSIS
 'git pack-objects' [-q | --progress | --all-progress] [--all-progress-implied]
 	[--no-reuse-delta] [--delta-base-offset] [--non-empty]
 	[--local] [--incremental] [--window=<n>] [--depth=<n>]
-	[--revs [--unpacked | --all]]
+	[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
 	[--stdout [--filter=<filter-spec>] | base-name]
 	[--shallow] [--keep-true-parents] < object-list
 
@@ -126,6 +126,13 @@ base-name::
 	has a .keep file to be ignored, even if it would have
 	otherwise been packed.
 
+--keep-pack=<pack-name>::
+	This flag causes an object already in the given pack to be
+	ignored, even if it would have otherwise been
+	packed. `<pack-name>` is the the pack file name without
+	leading directory (e.g. `pack-123.pack`). The option could be
+	specified multiple times to keep multiple packs.
+
 --incremental::
 	This flag causes an object already in a pack to be ignored
 	even if it would have otherwise been packed.
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..ce497d9d12 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -9,7 +9,7 @@ git-repack - Pack unpacked objects in a repository
 SYNOPSIS
 --------
 [verse]
-'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=<n>] [--depth=<n>] [--threads=<n>]
+'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
 
 DESCRIPTION
 -----------
@@ -133,6 +133,13 @@ other objects in that pack they already have locally.
 	with `-b` or `repack.writeBitmaps`, as it ensures that the
 	bitmapped packfile has the necessary objects.
 
+--keep-pack=<pack-name>::
+	Exclude the given pack from repacking. This is the equivalent
+	of having `.keep` file on the pack. `<pack-name>` is the the
+	pack file name without leading directory (e.g. `pack-123.pack`).
+	The option could be specified multiple times to keep multiple
+	packs.
+
 --unpack-unreachable=<when>::
 	When loosening unreachable objects, do not bother loosening any
 	objects older than `<when>`. This can be used to optimize out
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..7b9fe6c89f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -28,6 +28,7 @@
 #include "argv-array.h"
 #include "list.h"
 #include "packfile.h"
+#include "dir.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -53,7 +54,7 @@ static int pack_loose_unreachable;
 static int local;
 static int have_non_local_packs;
 static int incremental;
-static int ignore_packed_keep;
+static int ignore_packed_keep, ignore_packed_keep_in_core;
 static int allow_ofs_delta;
 static struct pack_idx_option pack_idx_opts;
 static const char *base_name;
@@ -982,13 +983,15 @@ static int want_found_object(int exclude, struct packed_git *p)
 	 * Otherwise, we signal "-1" at the end to tell the caller that we do
 	 * not know either way, and it needs to check more packs.
 	 */
-	if (!ignore_packed_keep &&
+	if (!ignore_packed_keep && !ignore_packed_keep_in_core &&
 	    (!local || !have_non_local_packs))
 		return 1;
 
 	if (local && !p->pack_local)
 		return 0;
-	if (ignore_packed_keep && p->pack_local && p->pack_keep)
+	if (p->pack_local &&
+	    ((ignore_packed_keep && p->pack_keep) ||
+	     (ignore_packed_keep_in_core && p->pack_keep_in_core)))
 		return 0;
 
 	/* we don't know yet; keep looking for more packs */
@@ -2677,7 +2680,7 @@ static void add_objects_in_unpacked_packs(struct rev_info *revs)
 		struct object_id oid;
 		struct object *o;
 
-		if (!p->pack_local || p->pack_keep)
+		if (!p->pack_local || p->pack_keep || p->pack_keep_in_core)
 			continue;
 		if (open_pack_index(p))
 			die("cannot open pack index");
@@ -2739,7 +2742,8 @@ static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid)
 	p = (last_found != (void *)1) ? last_found : packed_git;
 
 	while (p) {
-		if ((!p->pack_local || p->pack_keep) &&
+		if ((!p->pack_local || p->pack_keep ||
+				p->pack_keep_in_core) &&
 			find_pack_entry_one(oid->hash, p)) {
 			last_found = p;
 			return 1;
@@ -2782,7 +2786,7 @@ static void loosen_unused_packed_objects(struct rev_info *revs)
 	struct object_id oid;
 
 	for (p = packed_git; p; p = p->next) {
-		if (!p->pack_local || p->pack_keep)
+		if (!p->pack_local || p->pack_keep || p->pack_keep_in_core)
 			continue;
 
 		if (open_pack_index(p))
@@ -2809,6 +2813,7 @@ static int pack_options_allow_reuse(void)
 	return pack_to_stdout &&
 	       allow_ofs_delta &&
 	       !ignore_packed_keep &&
+	       !ignore_packed_keep_in_core &&
 	       (!local || !have_non_local_packs) &&
 	       !incremental;
 }
@@ -2917,6 +2922,33 @@ static void get_object_list(int ac, const char **av)
 	oid_array_clear(&recent_objects);
 }
 
+static void add_extra_kept_packs(const struct string_list *names)
+{
+	struct packed_git *p;
+
+	if (!names->nr)
+		return;
+
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next) {
+		const char *name = basename(p->pack_name);
+		int i;
+
+		if (!p->pack_local)
+			continue;
+
+		for (i = 0; i < names->nr; i++)
+			if (!fspathcmp(name, names->items[i].string))
+				break;
+
+		if (i < names->nr) {
+			p->pack_keep_in_core = 1;
+			ignore_packed_keep_in_core = 1;
+			continue;
+		}
+	}
+}
+
 static int option_parse_index_version(const struct option *opt,
 				      const char *arg, int unset)
 {
@@ -2956,6 +2988,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	struct argv_array rp = ARGV_ARRAY_INIT;
 	int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0;
 	int rev_list_index = 0;
+	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	struct option pack_objects_options[] = {
 		OPT_SET_INT('q', "quiet", &progress,
 			    N_("do not show progress meter"), 0),
@@ -3022,6 +3055,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("create packs suitable for shallow fetches")),
 		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep,
 			 N_("ignore packs that have companion .keep file")),
+		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
+				N_("ignore this pack")),
 		OPT_INTEGER(0, "compression", &pack_compression_level,
 			    N_("pack compression level")),
 		OPT_SET_INT(0, "keep-true-parents", &grafts_replace_parents,
@@ -3150,6 +3185,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		progress = 2;
 
 	prepare_packed_git();
+	add_extra_kept_packs(&keep_pack_list);
 	if (ignore_packed_keep) {
 		struct packed_git *p;
 		for (p = packed_git; p; p = p->next)
diff --git a/builtin/repack.c b/builtin/repack.c
index 7bdb40142f..6c636e159e 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -86,7 +86,8 @@ static void remove_pack_on_signal(int signo)
  * have a corresponding .keep or .promisor file. These packs are not to
  * be kept if we are going to pack everything into one file.
  */
-static void get_non_kept_pack_filenames(struct string_list *fname_list)
+static void get_non_kept_pack_filenames(struct string_list *fname_list,
+					const struct string_list *extra_keep)
 {
 	DIR *dir;
 	struct dirent *e;
@@ -97,6 +98,14 @@ static void get_non_kept_pack_filenames(struct string_list *fname_list)
 
 	while ((e = readdir(dir)) != NULL) {
 		size_t len;
+		int i;
+
+		for (i = 0; i < extra_keep->nr; i++)
+			if (!fspathcmp(e->d_name, extra_keep->items[i].string))
+				break;
+		if (extra_keep->nr > 0 && i < extra_keep->nr)
+			continue;
+
 		if (!strip_suffix(e->d_name, ".pack", &len))
 			continue;
 
@@ -148,7 +157,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	struct string_list rollback = STRING_LIST_INIT_NODUP;
 	struct string_list existing_packs = STRING_LIST_INIT_DUP;
 	struct strbuf line = STRBUF_INIT;
-	int ext, ret, failed;
+	int i, ext, ret, failed;
 	FILE *out;
 
 	/* variables to be filled by option parsing */
@@ -160,6 +169,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	const char *depth = NULL;
 	const char *threads = NULL;
 	const char *max_pack_size = NULL;
+	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	int no_reuse_delta = 0, no_reuse_object = 0;
 	int no_update_server_info = 0;
 	int quiet = 0;
@@ -200,6 +210,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 				N_("maximum size of each packfile")),
 		OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects,
 				N_("repack objects in packs marked with .keep")),
+		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
+				N_("do not repack this pack")),
 		OPT_END()
 	};
 
@@ -230,6 +242,9 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	argv_array_push(&cmd.args, "--keep-true-parents");
 	if (!pack_kept_objects)
 		argv_array_push(&cmd.args, "--honor-pack-keep");
+	for (i = 0; i < keep_pack_list.nr; i++)
+		argv_array_pushf(&cmd.args, "--keep-pack=%s",
+				 keep_pack_list.items[i].string);
 	argv_array_push(&cmd.args, "--non-empty");
 	argv_array_push(&cmd.args, "--all");
 	argv_array_push(&cmd.args, "--reflog");
@@ -254,7 +269,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 		argv_array_push(&cmd.args, "--write-bitmap-index");
 
 	if (pack_everything & ALL_INTO_ONE) {
-		get_non_kept_pack_filenames(&existing_packs);
+		get_non_kept_pack_filenames(&existing_packs, &keep_pack_list);
 
 		if (existing_packs.nr && delete_redundant) {
 			if (unpack_unreachable) {
diff --git a/cache.h b/cache.h
index 21fbcc2414..42d700f3d8 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int pack_fd;
 	unsigned pack_local:1,
 		 pack_keep:1,
+		 pack_keep_in_core:1,
 		 freshened:1,
 		 do_not_close:1,
 		 pack_promisor:1;
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 38247afbec..05ae0de3aa 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -196,5 +196,30 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
 	git cat-file -t $H1
 '
 
+test_expect_success 'repack --keep-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		test_commit one &&
+		git repack -d &&
+		test_commit two &&
+		git repack -d &&
+		test_commit three &&
+		git repack -d &&
+		test_commit four &&
+		git repack -d &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 4 pack-list &&
+		KEEP1=$(head -n1 pack-list) &&
+		KEEP4=$(tail -n1 pack-list) &&
+		git repack -a -d --keep-pack $KEEP1 --keep-pack $KEEP4 &&
+		ls .git/objects/pack/*.pack >new-counts &&
+		grep -q $KEEP1 new-counts &&
+		grep -q $KEEP4 new-counts &&
+		test_line_count = 3 new-counts &&
+		git fsck
+	)
+'
+
 test_done
 
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 2/7] gc: add --keep-base-pack
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  2018-03-16 19:27       ` [PATCH v3 1/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-16 21:05         ` Ævar Arnfjörð Bjarmason
  2018-03-16 21:25         ` Ævar Arnfjörð Bjarmason
  2018-03-16 19:27       ` [PATCH v3 3/7] gc: detect base packs based on gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
                         ` (5 subsequent siblings)
  7 siblings, 2 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This adds a new repack mode that combines everything into a secondary
pack, leaving the largest/base pack alone.

This could help reduce memory pressure. On linux-2.6.git, valgrind
massif reports 1.6GB heap in "pack all" case, and 535MB in "pack
all except the base pack" case. We save roughly 1GB memory by
excluding the base pack.

This should also lower I/O because we don't have to rewrite a giant
pack every time (e.g. for linux-2.6.git that's a 1.4GB pack file).

PS. The use of string_list here seems overkill, but we'll need it in
the next patch...

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-gc.txt |  7 +++++-
 builtin/gc.c             | 47 ++++++++++++++++++++++++++++++++++++----
 t/t6500-gc.sh            | 22 +++++++++++++++++++
 3 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 571b5a7e3c..1717517043 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -9,7 +9,7 @@ git-gc - Cleanup unnecessary files and optimize the local repository
 SYNOPSIS
 --------
 [verse]
-'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force]
+'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force] [--keep-base-pack]
 
 DESCRIPTION
 -----------
@@ -78,6 +78,11 @@ automatic consolidation of packs.
 	Force `git gc` to run even if there may be another `git gc`
 	instance running on this repository.
 
+--keep-base-pack::
+	All packs except the base pack and those marked with a `.keep`
+	files are consolidated into a single pack. The largest pack is
+	considered the base pack.
+
 Configuration
 -------------
 
diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..362dd537a4 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -164,6 +164,24 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
+static void find_base_packs(struct string_list *packs)
+{
+	struct packed_git *p, *base = NULL;
+
+	prepare_packed_git();
+
+	for (p = packed_git; p; p = p->next) {
+		if (!p->pack_local)
+			continue;
+		if (!base || base->pack_size < p->pack_size) {
+			base = p;
+		}
+	}
+
+	if (base)
+		string_list_append(packs, base->pack_name);
+}
+
 static int too_many_packs(void)
 {
 	struct packed_git *p;
@@ -187,7 +205,13 @@ static int too_many_packs(void)
 	return gc_auto_pack_limit < cnt;
 }
 
-static void add_repack_all_option(void)
+static int keep_one_pack(struct string_list_item *item, void *data)
+{
+	argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string));
+	return 0;
+}
+
+static void add_repack_all_option(struct string_list *keep_pack)
 {
 	if (prune_expire && !strcmp(prune_expire, "now"))
 		argv_array_push(&repack, "-a");
@@ -196,6 +220,9 @@ static void add_repack_all_option(void)
 		if (prune_expire)
 			argv_array_pushf(&repack, "--unpack-unreachable=%s", prune_expire);
 	}
+
+	if (keep_pack)
+		for_each_string_list(keep_pack, keep_one_pack, NULL);
 }
 
 static void add_repack_incremental_option(void)
@@ -219,7 +246,7 @@ static int need_to_gc(void)
 	 * there is no need.
 	 */
 	if (too_many_packs())
-		add_repack_all_option();
+		add_repack_all_option(NULL);
 	else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
@@ -353,6 +380,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 	const char *name;
 	pid_t pid;
 	int daemonized = 0;
+	int keep_base_pack = -1;
 
 	struct option builtin_gc_options[] = {
 		OPT__QUIET(&quiet, N_("suppress progress reporting")),
@@ -362,6 +390,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 		OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
 		OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
 		OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
+		OPT_BOOL(0, "keep-base-pack", &keep_base_pack,
+			 N_("repack all other packs except the base pack")),
 		OPT_END()
 	};
 
@@ -427,8 +457,17 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 			 */
 			daemonized = !daemonize();
 		}
-	} else
-		add_repack_all_option();
+	} else {
+		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
+
+		if (keep_base_pack != -1) {
+			if (keep_base_pack)
+				find_base_packs(&keep_pack);
+		}
+
+		add_repack_all_option(&keep_pack);
+		string_list_clear(&keep_pack, 0);
+	}
 
 	name = lock_repo_for_gc(force, &pid);
 	if (name) {
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 41b0be575d..4136681b47 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -116,6 +116,28 @@ test_expect_success 'background auto gc respects lock for all operations' '
 	test_path_is_file .git/refs/heads/should-be-loose
 '
 
+test_expect_success 'gc --keep-base-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		for i in 10; do
+			test_commit $i
+		done &&
+		git gc &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 1 pack-list &&
+		BASE_PACK=.git/objects/pack/pack-*.pack &&
+		for i in 10; do
+			test_commit more-$i
+		done &&
+		git gc --keep-base-pack &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 2 pack-list &&
+		test_path_is_file $BASE_PACK &&
+		git fsck
+	)
+'
+
 # DO NOT leave a detached auto gc process running near the end of the
 # test script: it can run long enough in the background to racily
 # interfere with the cleanup in 'test_done'.
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 3/7] gc: detect base packs based on gc.bigPackThreshold config
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  2018-03-16 19:27       ` [PATCH v3 1/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
  2018-03-16 19:27       ` [PATCH v3 2/7] gc: add --keep-base-pack Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-16 21:02         ` Ævar Arnfjörð Bjarmason
  2018-03-16 19:27       ` [PATCH v3 4/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
                         ` (4 subsequent siblings)
  7 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The --keep-base-pack option is not very convenient to use because you
need to tell gc to do this explicitly (and probably on just a few
large repos).

Add a config key that enables this mode when packs larger than a
limit are found.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  6 ++++++
 Documentation/git-gc.txt |  6 ++++--
 builtin/gc.c             | 26 ++++++++++++++++++++------
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..c12c58813c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1549,6 +1549,12 @@ gc.autoDetach::
 	Make `git gc --auto` return immediately and run in background
 	if the system supports it. Default is true.
 
+gc.bigPackThreshold::
+	If non-zero, all packs larger than this limit are kept when
+	`git gc` is run. This is very similar to `--keep-base-pack`
+	except that all packs that meet the threshold are kept, not
+	just the base pack. Defaults to zero.
+
 gc.logExpiry::
 	If the file gc.log exists, then `git gc --auto` won't run
 	unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 1717517043..89f074f924 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -55,7 +55,8 @@ all loose objects are combined into a single pack using
 disables automatic packing of loose objects.
 +
 If the number of packs exceeds the value of `gc.autoPackLimit`,
-then existing packs (except those marked with a `.keep` file)
+then existing packs (except those marked with a `.keep` file
+or over `gc.bigPackThreshold` limit)
 are consolidated into a single pack by using the `-A` option of
 'git repack'. Setting `gc.autoPackLimit` to 0 disables
 automatic consolidation of packs.
@@ -81,7 +82,8 @@ automatic consolidation of packs.
 --keep-base-pack::
 	All packs except the base pack and those marked with a `.keep`
 	files are consolidated into a single pack. The largest pack is
-	considered the base pack.
+	considered the base pack. When this option is used,
+	`gc.bigPackThreshold` is ignored.
 
 Configuration
 -------------
diff --git a/builtin/gc.c b/builtin/gc.c
index 362dd537a4..849f0821a9 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -39,6 +39,7 @@ static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
+static unsigned long big_pack_threshold;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -126,6 +127,8 @@ static void gc_config(void)
 	git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire);
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
+	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
+
 	git_config(git_default_config, NULL);
 }
 
@@ -164,7 +167,7 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static void find_base_packs(struct string_list *packs)
+static void find_base_packs(struct string_list *packs, unsigned long limit)
 {
 	struct packed_git *p, *base = NULL;
 
@@ -173,7 +176,10 @@ static void find_base_packs(struct string_list *packs)
 	for (p = packed_git; p; p = p->next) {
 		if (!p->pack_local)
 			continue;
-		if (!base || base->pack_size < p->pack_size) {
+		if (limit) {
+			if (p->pack_size >= limit)
+				string_list_append(packs, p->pack_name);
+		} else if (!base || base->pack_size < p->pack_size) {
 			base = p;
 		}
 	}
@@ -245,9 +251,15 @@ static int need_to_gc(void)
 	 * we run "repack -A -d -l".  Otherwise we tell the caller
 	 * there is no need.
 	 */
-	if (too_many_packs())
-		add_repack_all_option(NULL);
-	else if (too_many_loose_objects())
+	if (too_many_packs()) {
+		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
+
+		if (big_pack_threshold)
+			find_base_packs(&keep_pack, big_pack_threshold);
+
+		add_repack_all_option(&keep_pack);
+		string_list_clear(&keep_pack, 0);
+	} else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
 		return 0;
@@ -462,7 +474,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 
 		if (keep_base_pack != -1) {
 			if (keep_base_pack)
-				find_base_packs(&keep_pack);
+				find_base_packs(&keep_pack, 0);
+		} else if (big_pack_threshold) {
+			find_base_packs(&keep_pack, big_pack_threshold);
 		}
 
 		add_repack_all_option(&keep_pack);
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 4/7] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                         ` (2 preceding siblings ...)
  2018-03-16 19:27       ` [PATCH v3 3/7] gc: detect base packs based on gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-16 21:14         ` Ævar Arnfjörð Bjarmason
  2018-03-16 19:27       ` [PATCH v3 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
                         ` (3 subsequent siblings)
  7 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

pack-objects could be a big memory hog especially on large repos,
everybody knows that. The suggestion to stick a .keep file on the
giant base pack to avoid this problem has also been known for a long time.

Recent patches add an option to do just this, but it has to be either
configured or activated manually. This patch lets `git gc --auto`
activate this mode automatically when it thinks `repack -ad` will use
a lot of memory and start affecting the system due to swapping or
flushing OS cache.

gc --auto decides to do this based on an estimation of pack-objects
memory usage, which is quite accurate at least for the heap part, and
whether that fits in half of system memory (the assumption here is for
desktop environment where there are many other applications running).

This mechanism only kicks in if gc.bigPackThreshold is not configured.
If it is, it is assumed that the user already knows what they want.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-gc.txt |  9 +++-
 builtin/gc.c             | 99 +++++++++++++++++++++++++++++++++++++++-
 builtin/pack-objects.c   |  2 +-
 config.mak.uname         |  1 +
 git-compat-util.h        |  4 ++
 pack-objects.h           |  2 +
 t/t6500-gc.sh            |  7 +++
 7 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 89f074f924..19b0d1741b 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -58,8 +58,13 @@ If the number of packs exceeds the value of `gc.autoPackLimit`,
 then existing packs (except those marked with a `.keep` file
 or over `gc.bigPackThreshold` limit)
 are consolidated into a single pack by using the `-A` option of
-'git repack'. Setting `gc.autoPackLimit` to 0 disables
-automatic consolidation of packs.
+'git repack'.
+If the amount of memory is estimated not enough for `git repack` to
+run smoothly and `gc.bigPackThreshold` is not set, the largest
+pack will also be excluded (this is the equivalent of running `git gc`
+with `--keep-base-pack`).
+Setting `gc.autoPackLimit` to 0 disables automatic consolidation of
+packs.
 
 --prune=<date>::
 	Prune loose objects older than date (default is 2 weeks ago,
diff --git a/builtin/gc.c b/builtin/gc.c
index 849f0821a9..c0f1922c24 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,10 @@
 #include "argv-array.h"
 #include "commit.h"
 #include "packfile.h"
+#include "pack.h"
+#include "pack-objects.h"
+#include "blob.h"
+#include "tree.h"
 
 #define FAILED_RUN "failed to run %s"
 
@@ -40,6 +44,7 @@ static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
 static unsigned long big_pack_threshold;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -128,6 +133,7 @@ static void gc_config(void)
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
 	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
+	git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size);
 
 	git_config(git_default_config, NULL);
 }
@@ -167,7 +173,8 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static void find_base_packs(struct string_list *packs, unsigned long limit)
+static struct packed_git *find_base_packs(struct string_list *packs,
+					  unsigned long limit)
 {
 	struct packed_git *p, *base = NULL;
 
@@ -186,6 +193,8 @@ static void find_base_packs(struct string_list *packs, unsigned long limit)
 
 	if (base)
 		string_list_append(packs, base->pack_name);
+
+	return base;
 }
 
 static int too_many_packs(void)
@@ -211,6 +220,79 @@ static int too_many_packs(void)
 	return gc_auto_pack_limit < cnt;
 }
 
+static uint64_t total_ram(void)
+{
+#if defined(HAVE_SYSINFO)
+	struct sysinfo si;
+
+	if (!sysinfo(&si))
+		return si.totalram;
+#elif defined(HAVE_BSD_SYSCTL) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM))
+	int64_t physical_memory;
+	int mib[2];
+	size_t length;
+
+	mib[0] = CTL_HW;
+# if defined(HW_MEMSIZE)
+	mib[1] = HW_MEMSIZE;
+# else
+	mib[1] = HW_PHYSMEM;
+# endif
+	length = sizeof(int64_t);
+	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
+		return physical_memory;
+#elif defined(GIT_WINDOWS_NATIVE)
+	MEMORYSTATUSEX memInfo;
+
+	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+	if (GlobalMemoryStatusEx(&memInfo))
+		return memInfo.ullTotalPhys;
+#endif
+	return 0;
+}
+
+static uint64_t estimate_repack_memory(struct packed_git *pack)
+{
+	unsigned long nr_objects = approximate_object_count();
+	size_t os_cache, heap;
+
+	if (!pack || !nr_objects)
+		return 0;
+
+	/*
+	 * First we have to scan through at least one pack.
+	 * Assume enough room in OS file cache to keep the entire pack
+	 * or we may accidentally evict data of other processes from
+	 * the cache.
+	 */
+	os_cache = pack->pack_size + pack->index_size;
+	/* then pack-objects needs lots more for book keeping */
+	heap = sizeof(struct object_entry) * nr_objects;
+	/*
+	 * internal rev-list --all --objects takes up some memory too,
+	 * let's say half of it is for blobs
+	 */
+	heap += sizeof(struct blob) * nr_objects / 2;
+	/*
+	 * and the other half is for trees (commits and tags are
+	 * usually insignificant)
+	 */
+	heap += sizeof(struct tree) * nr_objects / 2;
+	/* and then obj_hash[], underestimated in fact */
+	heap += sizeof(struct object *) * nr_objects;
+	/* revindex is used also */
+	heap += sizeof(struct revindex_entry) * nr_objects;
+	/*
+	 * read_sha1_file() (either at delta calculation phase, or
+	 * writing phase) also fills up the delta base cache
+	 */
+	heap += delta_base_cache_limit;
+	/* and of course pack-objects has its own delta cache */
+	heap += max_delta_cache_size;
+
+	return os_cache + heap;
+}
+
 static int keep_one_pack(struct string_list_item *item, void *data)
 {
 	argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string));
@@ -256,6 +338,21 @@ static int need_to_gc(void)
 
 		if (big_pack_threshold)
 			find_base_packs(&keep_pack, big_pack_threshold);
+		else {
+			struct packed_git * p = find_base_packs(&keep_pack, 0);
+			uint64_t mem_have, mem_want;
+
+			mem_have = total_ram();
+			mem_want = estimate_repack_memory(p);
+
+			/*
+			 * Only allow 1/2 of memory for pack-objects, leave
+			 * the rest for the OS and other processes in the
+			 * system.
+			 */
+			if (!mem_have || mem_want < mem_have / 2)
+				string_list_clear(&keep_pack, 0);
+		}
 
 		add_repack_all_option(&keep_pack);
 		string_list_clear(&keep_pack, 0);
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7b9fe6c89f..6abde6ec6d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -79,7 +79,7 @@ static uint16_t write_bitmap_options;
 static int exclude_promisor_objects;
 
 static unsigned long delta_cache_size = 0;
-static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
diff --git a/config.mak.uname b/config.mak.uname
index 6a1d0de0cc..ae9cbccec1 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -37,6 +37,7 @@ ifeq ($(uname_S),Linux)
 	HAVE_GETDELIM = YesPlease
 	SANE_TEXT_GREP=-a
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	BASIC_CFLAGS += -DHAVE_SYSINFO
 endif
 ifeq ($(uname_S),GNU/kFreeBSD)
 	HAVE_ALLOCA_H = YesPlease
diff --git a/git-compat-util.h b/git-compat-util.h
index 68b2ad531e..a84b21986d 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -284,6 +284,10 @@ extern char *gitdirname(char *);
 #include <openssl/err.h>
 #endif
 
+#ifdef HAVE_SYSINFO
+# include <sys/sysinfo.h>
+#endif
+
 /* On most systems <netdb.h> would have given us this, but
  * not on some systems (e.g. z/OS).
  */
diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..af4f46c026 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,8 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 4136681b47..96ca70f9cc 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -5,6 +5,13 @@ test_description='basic git gc tests
 
 . ./test-lib.sh
 
+test_expect_success 'setup' '
+	# do not let the amount of physical memory affects gc
+	# behavior, make sure we always pack everything to one pack by
+	# default
+	git config gc.bigPackThreshold 2g
+'
+
 test_expect_success 'gc empty repository' '
 	git gc
 '
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 5/7] gc: handle a corner case in gc.bigPackThreshold
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                         ` (3 preceding siblings ...)
  2018-03-16 19:27       ` [PATCH v3 4/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-16 21:10         ` Ævar Arnfjörð Bjarmason
  2018-03-16 19:27       ` [PATCH v3 6/7] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
                         ` (2 subsequent siblings)
  7 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This config allows us to keep <N> packs back if their size is larger
than a limit. But if this N >= gc.autoPackLimit, we may have a
problem. We are supposed to reduce the number of packs after a
threshold because it affects performance.

We could tell the user that they have incompatible gc.bigPackThreshold
and gc.autoPackLimit, but it's kinda hard when 'git gc --auto' runs in
background. Instead let's fall back to the next best strategy: try to
reduce the number of packs anyway, but keep the base pack out. This
reduces the number of packs to two and hopefully won't take up too
much resources to repack (the assumption still is the base pack takes
most resources to handle).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt | 5 +++++
 builtin/gc.c             | 9 +++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index c12c58813c..ce40112e31 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1554,6 +1554,11 @@ gc.bigPackThreshold::
 	`git gc` is run. This is very similar to `--keep-base-pack`
 	except that all packs that meet the threshold are kept, not
 	just the base pack. Defaults to zero.
++
+Note that if the number of kept packs is more than gc.autoPackLimit,
+this configuration variable is ignored, all packs except the base pack
+will be repacked. After this the number of packs should go below
+gc.autoPackLimit and gc.bigPackThreshold should be respected again.
 
 gc.logExpiry::
 	If the file gc.log exists, then `git gc --auto` won't run
diff --git a/builtin/gc.c b/builtin/gc.c
index c0f1922c24..140c1bb7dd 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -336,9 +336,14 @@ static int need_to_gc(void)
 	if (too_many_packs()) {
 		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
 
-		if (big_pack_threshold)
+		if (big_pack_threshold) {
 			find_base_packs(&keep_pack, big_pack_threshold);
-		else {
+			if (keep_pack.nr >= gc_auto_pack_limit) {
+				big_pack_threshold = 0;
+				string_list_clear(&keep_pack, 0);
+				find_base_packs(&keep_pack, 0);
+			}
+		} else {
 			struct packed_git * p = find_base_packs(&keep_pack, 0);
 			uint64_t mem_have, mem_want;
 
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 6/7] pack-objects: show some progress when counting kept objects
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                         ` (4 preceding siblings ...)
  2018-03-16 19:27       ` [PATCH v3 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-16 19:27       ` [PATCH v3 7/7] pack-objects: display progress in get_object_details() Nguyễn Thái Ngọc Duy
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  7 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only show progress when there are new objects to be packed. But
when --keep-pack is specified on the base pack, we will exclude most
of objects. This makes 'pack-objects' stay silent for a long time
while the counting phase is going.

Let's show some progress whenever we visit an object instead. The
number of packed objects will be shown afterwards if it's not the same
as the number of visited objects.

Since the meaning of this number has changed, use another word instead
of "Counting" to hint about the change.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 6abde6ec6d..f74e9117f7 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -44,7 +44,7 @@ static const char *pack_usage[] = {
 static struct packing_data to_pack;
 
 static struct pack_idx_entry **written_list;
-static uint32_t nr_result, nr_written;
+static uint32_t nr_result, nr_written, nr_seen;
 
 static int non_empty;
 static int reuse_delta = 1, reuse_object = 1;
@@ -1094,6 +1094,8 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	off_t found_offset = 0;
 	uint32_t index_pos;
 
+	display_progress(progress_state, ++nr_seen);
+
 	if (have_duplicate_entry(oid, exclude, &index_pos))
 		return 0;
 
@@ -1109,8 +1111,6 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	create_object_entry(oid, type, pack_name_hash(name),
 			    exclude, name && no_try_delta(name),
 			    index_pos, found_pack, found_offset);
-
-	display_progress(progress_state, nr_result);
 	return 1;
 }
 
@@ -1121,6 +1121,8 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 {
 	uint32_t index_pos;
 
+	display_progress(progress_state, ++nr_seen);
+
 	if (have_duplicate_entry(oid, 0, &index_pos))
 		return 0;
 
@@ -1128,8 +1130,6 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 		return 0;
 
 	create_object_entry(oid, type, name_hash, 0, 0, index_pos, pack, offset);
-
-	display_progress(progress_state, nr_result);
 	return 1;
 }
 
@@ -3210,7 +3210,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 
 	if (progress)
-		progress_state = start_progress(_("Counting objects"), 0);
+		progress_state = start_progress(_("Enumerating objects"), 0);
 	if (!use_internal_rev_list)
 		read_object_list_from_stdin();
 	else {
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v3 7/7] pack-objects: display progress in get_object_details()
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                         ` (5 preceding siblings ...)
  2018-03-16 19:27       ` [PATCH v3 6/7] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:27       ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  7 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-16 19:27 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This code is mostly about reading object headers, which is cheap. But
when the number of objects is very large (e.g. 6.5M on linux-2.6.git)
and the system is under memory pressure, this could take some time (86
seconds on my system).

Show something during this time to let the user know pack-objects is
still going strong.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f74e9117f7..ac8f29dd52 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1715,6 +1715,10 @@ static void get_object_details(void)
 	uint32_t i;
 	struct object_entry **sorted_by_offset;
 
+	if (progress)
+		progress_state = start_progress(_("Getting object details"),
+						to_pack.nr_objects);
+
 	sorted_by_offset = xcalloc(to_pack.nr_objects, sizeof(struct object_entry *));
 	for (i = 0; i < to_pack.nr_objects; i++)
 		sorted_by_offset[i] = to_pack.objects + i;
@@ -1725,7 +1729,9 @@ static void get_object_details(void)
 		check_object(entry);
 		if (big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
+		display_progress(progress_state, i + 1);
 	}
+	stop_progress(&progress_state);
 
 	/*
 	 * This must happen in a second pass, since we rely on the delta
-- 
2.16.2.903.gd04caf5039


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 08/11] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-16 18:31         ` [PATCH v4 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:40           ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 19:40 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> We only cache deltas when it's smaller than a certain limit. This limit
> defaults to 1000 but save its compressed length in a 64-bit field.
> Shrink that field down to 16 bits, so you can only cache 65kb deltas.
> Larger deltas must be recomputed at when the pack is written down.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---

>  		if (entry->delta_data && !pack_to_stdout) {
> -			entry->z_delta_size = do_compress(&entry->delta_data,
> -							  entry->delta_size);
> -			cache_lock();
> -			delta_cache_size -= entry->delta_size;
> -			delta_cache_size += entry->z_delta_size;
> -			cache_unlock();
> +			unsigned long size;
> +
> +			size = do_compress(&entry->delta_data, entry->delta_size);
> +			entry->z_delta_size = size;
> +			if (entry->z_delta_size == size) {

It is confusing to readers to write

	A = B;
	if (A == B) {
		/* OK, A was big enough */
	} else {
		/* No, B is too big to fit on A */
	}

I actually was about to complain that you attempted an unrelated
micro-optimization to skip cache_lock/unlock when delta_size and
z_delta_size are the same, and made a typo.  Something like:

	size = do_compress(...);
	if (size < (1 << OE_Z_DELTA_BITS)) {
		entry->z_delta_size = size;
		cache_lock();
		...
                cache_unlock();
	} else {
		FREE_AND_NULL(entry->delta_data);
		entry->z_delta_size = 0;
	}

would have saved me a few dozens of seconds of head-scratching.

> +				cache_lock();
> +				delta_cache_size -= entry->delta_size;
> +				delta_cache_size += entry->z_delta_size;
> +				cache_unlock();
> +			} else {
> +				FREE_AND_NULL(entry->delta_data);
> +				entry->z_delta_size = 0;
> +			}
>  		}

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-16 18:31         ` [PATCH v4 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-16 19:49           ` Junio C Hamano
  2018-03-16 21:34             ` Junio C Hamano
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 19:49 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> It's very very rare that an uncompressd object is larger than 4GB
> (partly because Git does not handle those large files very well to
> begin with). Let's optimize it for the common case where object size
> is smaller than this limit.
>
> Shrink size field down to 32 bits [1] and one overflow bit. If the size
> is too large, we read it back from disk.

OK.

> Add two compare helpers that can take advantage of the overflow
> bit (e.g. if the file is 4GB+, chances are it's already larger than
> core.bigFileThreshold and there's no point in comparing the actual
> value).

I had trouble reading the callers of these helpers.

> +static inline int oe_size_less_than(const struct object_entry *e,
> +				    unsigned long limit)
> +{
> +	if (e->size_valid)
> +		return e->size_ < limit;
> +	if (limit > maximum_unsigned_value_of_type(uint32_t))
> +		return 1;

When size_valid bit is false, that means that the size is larger
than 4GB.  If "limit" is larger than 4GB, then we do not know
anything, no?  I'd understand if this "optimization" were

	if (limit < 4GB) {
		/*
		 * we know e whose size won't fit in 4GB is larger
		 * than that!
		 */
		return 0;
	}

> +	return oe_size(e) < limit;
> +}

Also, don't we want to use uintmax_t throughout the callchain?  How
would the code in this series work when your ulong is 32-bit?

> +
> +static inline int oe_size_greater_than(const struct object_entry *e,
> +				       unsigned long limit)
> +{
> +	if (e->size_valid)
> +		return e->size_ > limit;
> +	if (limit <= maximum_unsigned_value_of_type(uint32_t))
> +		return 1;
> +	return oe_size(e) > limit;
> +}
> +
> +static inline void oe_set_size(struct object_entry *e,
> +			       unsigned long size)
> +{
> +	e->size_ = size;
> +	e->size_valid = e->size_ == size;
> +}
> +
>  #endif

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v2 4/5] pack-objects: show some progress when counting kept objects
  2018-03-16 19:14         ` Duy Nguyen
@ 2018-03-16 20:13           ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-16 20:13 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Jeff King, Junio C Hamano

On Fri, Mar 16, 2018 at 8:14 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Mon, Mar 12, 2018 at 7:32 PM, Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> On Tue, Mar 06 2018, Nguyễn Thái Ngọc Duy jotted:
>>
>>> We only show progress when there are new objects to be packed. But
>>> when --keep-pack is specified on the base pack, we will exclude most
>>> of objects. This makes 'pack-objects' stay silent for a long time
>>> while the counting phase is going.
>>>
>>> Let's show some progress whenever we visit an object instead. The
>>> number of packed objects will be shown after if it's not the same as
>>> the number of visited objects.
>>>
>>> Since the meaning of this number has changed, use another word instead
>>> of "Counting" to hint about the change.
>>
>> Can you elaborate on how the meaning has changed? With/without this on
>> linux.git I get:
>>
>> With:
>>
>>     Enumerating objects: 5901144, done.
>>     Getting object details: 100% (5901145/5901145), done.
>>     Delta compression using up to 8 threads.
>>
>> Without:
>>
>>     Counting objects: 5901145, done.
>>     Delta compression using up to 8 threads.
>>
>> So now we're seemingly off-by-one but otherwise doing the same thing?
>
> Yep, it's an off-by-one bug.
>
>> As for as user feedback goes we might as well have said "Reticulating
>> splines", but I have some bias towards keeping the current "Counting
>> objects..." phrasing. We ourselves have other docs referring to it that
>> aren't changed by this patch, and there's
>> e.g. https://githubengineering.com/counting-objects/ and lots of other
>> 3rd party docs that refer to this.
>
> This is why I changed the phrase. The counting is now a bit different.
> Documents describing this exact phrase won't apply to the new version.
>
> The old way counts objects that will be packed. The new way simply
> counts objects that are visited. When you keep some packs, the number
> of objects you visit but not pack could be very high, while in normal
> case the two numbers should be the same (e.g. you pack everything you
> visit). I would prefer to print both values (e.g. "counting objects:
> <packed>/<visited>") but it's not possible with the current progress
> code.

On second thought, maybe instead of introducing a new line "getting
object details" i could just rename that line to "counting objects"?
They are exactly the same, except that in the new version, this
"counting objects" line could run a lot faster than the old line.


-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry
  2018-03-16 18:31         ` [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-16 20:32           ` Junio C Hamano
  2018-03-17 11:59             ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 20:32 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> The role of this comment block becomes more important after we shuffle
> fields around to shrink this struct. It will be much harder to see what
> field is related to what.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  pack-objects.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 44 insertions(+)
>
> diff --git a/pack-objects.h b/pack-objects.h
> index 03f1191659..85345a4af1 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -1,6 +1,50 @@
>  #ifndef PACK_OBJECTS_H
>  #define PACK_OBJECTS_H
>  
> +/*
> + * basic object info
> + * -----------------
> + * idx.oid is filled up before delta searching starts. idx.crc32 and
> + * is only valid after the object is written out and will be used for

"and is"?

> + * generating the index. idx.offset will be both gradually set and
> + * used in writing phase (base objects get offset first, then deltas
> + * refer to them)
> + *
> + * "size" is the uncompressed object size. Compressed size is not
> + * cached (ie. raw data in a pack) but available via revindex.

I am having a hard time understanding what "ie. raw data in a pack"
is doing in that sentence.

It is correct that compressed size is not cached; it does not even
exist and the only way to know it is to compute it by reversing the
.idx file (or actually uncompressing the compressed stream).

Perhaps:

    Compressed size of the raw data for an object in a pack is not
    stored anywhere but is computed and made available when reverse
    .idx is made.

> + * "hash" contains a path name hash which is used for sorting the
> + * delta list and also during delta searching. Once prepare_pack()
> + * returns it's no longer needed.

Hmm, that suggests an interesting optimization opportunity ;-)

> + * source pack info
> + * ----------------
> + * The (in_pack, in_pack_offset, in_pack_header_size) tuple contains
> + * the location of the object in the source pack, with or without
> + * header.

"with or without", meaning...?  An object in the source pack may or
may not have any in_pack_header, in which case in_pack_header_size
is zero, or something?  Not suggesting to rephrase (at least not
yet), but trying to understand.

> + * "type" and "in_pack_type" both describe object type. in_pack_type
> + * may contain a delta type, while type is always the canonical type.
> + *
> + * deltas
> + * ------
> + * Delta links (delta, delta_child and delta_sibling) are created
> + * reflect that delta graph from the source pack then updated or added
> + * during delta searching phase when we find better deltas.

Isn't anything missing after "are created"?  Perhaps "to"?

> + *
> + * delta_child and delta_sibling are last needed in
> + * compute_write_order(). "delta" and "delta_size" must remain valid
> + * at object writing phase in case the delta is not cached.

True.  I thought child and sibling are only needed during write
order computing, so there may be an optimization opportunity there.

> + * If a delta is cached in memory and is compressed delta_data points

s/compressed delta_data/compressed, delta_data/;

> + * to the data and z_delta_size contains the compressed size. If it's
> + * uncompressed [1], z_delta_size must be zero. delta_size is always
> + * the uncompressed size and must be valid even if the delta is not
> + * cached.
> + *
> + * [1] during try_delta phase we don't bother with compressing because
> + * the delta could be quickly replaced with a better one.
> + */
>  struct object_entry {
>  	struct pack_idx_entry idx;
>  	unsigned long size;	/* uncompressed size */

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 02/11] pack-objects: turn type and in_pack_type to bitfields
  2018-03-16 18:31         ` [PATCH v4 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-16 20:49           ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 20:49 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> An extra field type_valid is added to carry the equivalent of OBJ_BAD
> in the original "type" field. in_pack_type always contains a valid
> type so we only need 3 bits for it.
> ...
> @@ -1570,7 +1576,7 @@ static void drop_reused_delta(struct object_entry *entry)
>  	entry->depth = 0;
>  
>  	oi.sizep = &entry->size;
> -	oi.typep = &entry->type;
> +	oi.typep = &type;
>  	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
>  		/*
>  		 * We failed to get the info from this pack for some reason;
> @@ -1578,8 +1584,10 @@ static void drop_reused_delta(struct object_entry *entry)
>  		 * And if that fails, the error will be recorded in entry->type

This "entry->type" needs updating.

>  		 * and dealt with in prepare_pack().
>  		 */
> -		entry->type = sha1_object_info(entry->idx.oid.hash,
> -					       &entry->size);
> +		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
> +						    &entry->size));
> +	} else {
> +		oe_set_type(entry, type);
>  	}
>  }

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 07/11] pack-objects: refer to delta objects by index instead of pointer
  2018-03-16 18:31         ` [PATCH v4 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-16 20:59           ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 20:59 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> These delta pointers always point to elements in the objects[] array
> in packing_data struct. We can only hold maximum 4GB of those objects

4GB, as in "number of bytes"?  Or "We can hold 4 billion or so of
those objects"?

> because the array length, nr_objects, is uint32_t. We could use
> uint32_t indexes to address these elements instead of pointers. On
> 64-bit architecture (8 bytes per pointer) this would save 4 bytes per
> pointer.
>
> Convert these delta pointers to indexes. Since we need to handle NULL
> pointers as well, the index is shifted by one [1].
>
> [1] This means we can only index 2^32-2 objects even though nr_objects
>     could contain 2^32-1 objects. It should not be a problem in
>     practice because when we grow objects[], nr_alloc would probably
>     blow up long before nr_objects hits the wall.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 11/11] pack-objects.h: reorder members to shrink struct object_entry
  2018-03-16 18:32         ` [PATCH v4 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-16 21:02           ` Junio C Hamano
  2018-03-17 12:07             ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 21:02 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Previous patches leave lots of holes and padding in this struct. This
> patch reorders the members and shrinks the struct down to 80 bytes
> (from 136 bytes, before any field shrinking is done) with 16 bits to
> spare (and a couple more in in_pack_header_size when we really run out
> of bits).

Nice.

I am wondering if we need some conditional code for 32-bit platform.
For example, you have uint32_t field and do things like this:

        static inline int oe_size_less_than(const struct object_entry *e,
                                            unsigned long limit)
        {
                if (e->size_valid)
                        return e->size_ < limit;
                if (limit > maximum_unsigned_value_of_type(uint32_t))
                        return 1;
                return oe_size(e) < limit;
        }

Do we and compilers do the right thing when your ulong is uint32_t?

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 3/7] gc: detect base packs based on gc.bigPackThreshold config
  2018-03-16 19:27       ` [PATCH v3 3/7] gc: detect base packs based on gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
@ 2018-03-16 21:02         ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-16 21:02 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:

Awesome, thanks for making this support N large packs.

> +gc.bigPackThreshold::
> +	If non-zero, all packs larger than this limit are kept when
> +	`git gc` is run. This is very similar to `--keep-base-pack`
> +	except that all packs that meet the threshold are kept, not
> +	just the base pack. Defaults to zero.
> +

We should add:

    +
    Common unit suffixes of 'k', 'm', or 'g' are supported.

Since this now supports those suffixes (yay!), see existing copy/pasting
of that phrase in "git help config".

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 2/7] gc: add --keep-base-pack
  2018-03-16 19:27       ` [PATCH v3 2/7] gc: add --keep-base-pack Nguyễn Thái Ngọc Duy
@ 2018-03-16 21:05         ` Ævar Arnfjörð Bjarmason
  2018-03-19 17:26           ` Duy Nguyen
  2018-03-16 21:25         ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-16 21:05 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:

> +--keep-base-pack::
> +	All packs except the base pack and those marked with a `.keep`
> +	files are consolidated into a single pack. The largest pack is
> +	considered the base pack.
> +

I wonder if all of this would be less confusing as:

> +--keep-biggest-pack::
> +	All packs except the largest pack and those marked with a `.keep`
> +	files are consolidated into a single pack.

I.e. just skimming these docs I'd expect "base" to somehow be the thing
that we initially cloned, of course in almost all cases that *is* the
largest pack, but not necessarily. So rather than communicate that
expectation let's just say largest/biggest?

Maybe I'm the only one who finds this confusing...

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 5/7] gc: handle a corner case in gc.bigPackThreshold
  2018-03-16 19:27       ` [PATCH v3 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
@ 2018-03-16 21:10         ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-16 21:10 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:

> This config allows us to keep <N> packs back if their size is larger
> than a limit. But if this N >= gc.autoPackLimit, we may have a
> problem. We are supposed to reduce the number of packs after a
> threshold because it affects performance.
>
> We could tell the user that they have incompatible gc.bigPackThreshold
> and gc.autoPackLimit, but it's kinda hard when 'git gc --auto' runs in
> background. Instead let's fall back to the next best strategy: try to
> reduce the number of packs anyway, but keep the base pack out. This
> reduces the number of packs to two and hopefully won't take up too
> much resources to repack (the assumption still is the base pack takes
> most resources to handle).

I think this strategy makes perfect sense.

Those with say a 1GB "base" pack might set this setting to 500MB or
something large like that, then it's realistically never going to happen
that you're going to then have a collision between gc.bigPackThreshold
and gc.autoPackLimit, even if your checkout is many years old *maybe*
you've accumulated 5-10 of those 500MB packs for any sane repo.

But this also allows for setting this value really low, e.g. 50MB or
something to place a very low upper bound on how much memory GC takes on
a regular basis, but of course you'll need to repack that set of 50MB's
eventually.

Great!

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 4/7] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-16 19:27       ` [PATCH v3 4/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
@ 2018-03-16 21:14         ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-16 21:14 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:


> +			struct packed_git * p = find_base_packs(&keep_pack, 0);

Style nit: space after "*".

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 2/7] gc: add --keep-base-pack
  2018-03-16 19:27       ` [PATCH v3 2/7] gc: add --keep-base-pack Nguyễn Thái Ngọc Duy
  2018-03-16 21:05         ` Ævar Arnfjörð Bjarmason
@ 2018-03-16 21:25         ` Ævar Arnfjörð Bjarmason
  1 sibling, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-16 21:25 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:

>  	struct option builtin_gc_options[] = {
>  		OPT__QUIET(&quiet, N_("suppress progress reporting")),
> @@ -362,6 +390,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
>  		OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
>  		OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
>  		OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
> +		OPT_BOOL(0, "keep-base-pack", &keep_base_pack,
> +			 N_("repack all other packs except the base pack")),
>  		OPT_END()
>  	};

There's an easy to solve merge conflict here between the current master
& this. Pushed out a solution (for my own use) at
https://github.com/avar/git/ gc-auto-keep-base-pack. Interdiff with
yours:

    @@ -112,9 +112,9 @@
        struct option builtin_gc_options[] = {
                OPT__QUIET(&quiet, N_("suppress progress reporting")),
     @@
    -           OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
    -           OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
    -           OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
    +           OPT_BOOL_F(0, "force", &force,
    +                      N_("force running gc even if there may be another gc running"),
    +                      PARSE_OPT_NOCOMPLETE),
     +          OPT_BOOL(0, "keep-base-pack", &keep_base_pack,
     +                   N_("repack all other packs except the base pack")),
                OPT_END()

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-16 19:49           ` Junio C Hamano
@ 2018-03-16 21:34             ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-16 21:34 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Junio C Hamano <gitster@pobox.com> writes:

> Also, don't we want to use uintmax_t throughout the callchain?  How
> would the code in this series work when your ulong is 32-bit?

My own answer to this question is "no conversion to uintmax_t, at
least not in this series."  As long as the original code uses
"unsigned long", this series also should, I think.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry
  2018-03-16 20:32           ` Junio C Hamano
@ 2018-03-17 11:59             ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-17 11:59 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Fri, Mar 16, 2018 at 9:32 PM, Junio C Hamano <gitster@pobox.com> wrote:
>> +/*
>> + * basic object info
>> + * -----------------
>> + * idx.oid is filled up before delta searching starts. idx.crc32 and
>> + * is only valid after the object is written out and will be used for
>
> "and is"?

There was another field that I thought was only valid after blah blah.
But it was wrong and I forgot to delete this "and" after deleting that
field.

>> + * "hash" contains a path name hash which is used for sorting the
>> + * delta list and also during delta searching. Once prepare_pack()
>> + * returns it's no longer needed.
>
> Hmm, that suggests an interesting optimization opportunity ;-)

Heh.. it does not reduce peak memory consumption though which is why
I'm less interested in freeing it after prepare_pack().

>> + * source pack info
>> + * ----------------
>> + * The (in_pack, in_pack_offset, in_pack_header_size) tuple contains
>> + * the location of the object in the source pack, with or without
>> + * header.
>
> "with or without", meaning...?  An object in the source pack may or
> may not have any in_pack_header, in which case in_pack_header_size
> is zero, or something?  Not suggesting to rephrase (at least not
> yet), but trying to understand.

The location with the header (i.e. the true beginning of an object in a pack)
or without/after the header so you are at the zlib stream, ready to
inflate or reuse. I'll rephrase this a bit.

>> + *
>> + * delta_child and delta_sibling are last needed in
>> + * compute_write_order(). "delta" and "delta_size" must remain valid
>> + * at object writing phase in case the delta is not cached.
>
> True.  I thought child and sibling are only needed during write
> order computing, so there may be an optimization opportunity there.

See. I wrote all this for a reason. Somebody looking for low-hanging
fruit can always find some ;-)
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 11/11] pack-objects.h: reorder members to shrink struct object_entry
  2018-03-16 21:02           ` Junio C Hamano
@ 2018-03-17 12:07             ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-17 12:07 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Fri, Mar 16, 2018 at 10:02 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> Previous patches leave lots of holes and padding in this struct. This
>> patch reorders the members and shrinks the struct down to 80 bytes
>> (from 136 bytes, before any field shrinking is done) with 16 bits to
>> spare (and a couple more in in_pack_header_size when we really run out
>> of bits).
>
> Nice.
>
> I am wondering if we need some conditional code for 32-bit platform.
> For example, you have uint32_t field and do things like this:
>
>         static inline int oe_size_less_than(const struct object_entry *e,
>                                             unsigned long limit)
>         {
>                 if (e->size_valid)
>                         return e->size_ < limit;
>                 if (limit > maximum_unsigned_value_of_type(uint32_t))
>                         return 1;
>                 return oe_size(e) < limit;
>         }
>
> Do we and compilers do the right thing when your ulong is uint32_t?

Another good point. My 32-bit build does complain

In file included from builtin/pack-objects.c:20:0:
./pack-objects.h: In function ‘oe_size_less_than’:
./pack-objects.h:282:12: error: comparison is always false due to
limited range of data type [-Werror=type-limits]
  if (limit > maximum_unsigned_value_of_type(uint32_t))
            ^
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v5 00/11] nd/pack-objects-pack-struct updates
  2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                           ` (10 preceding siblings ...)
  2018-03-16 18:32         ` [PATCH v4 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10         ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
                             ` (12 more replies)
  11 siblings, 13 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

v5 changes are small enough that the interdiff is pretty self
explanatory (there's also a couple commit msg updates).

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index c388d87c3e..fb2aba80bf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1611,7 +1611,7 @@ static void drop_reused_delta(struct object_entry *entry)
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
-		 * And if that fails, the error will be recorded in entry->type
+		 * And if that fails, the error will be recorded in oe_type(entry)
 		 * and dealt with in prepare_pack().
 		 */
 		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
@@ -1968,7 +1968,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= maximum_unsigned_value_of_type(uint32_t))
+	if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
 		return 0;
 
 	if (DELTA(trg_entry)) {
@@ -2125,8 +2125,8 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 			unsigned long size;
 
 			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
-			entry->z_delta_size = size;
-			if (entry->z_delta_size == size) {
+			if (size < (1 << OE_Z_DELTA_BITS)) {
+				entry->z_delta_size = size;
 				cache_lock();
 				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
diff --git a/pack-objects.h b/pack-objects.h
index 0fa0c83294..8979289f5f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -27,14 +27,15 @@ enum dfs_state {
  *
  * basic object info
  * -----------------
- * idx.oid is filled up before delta searching starts. idx.crc32 and
- * is only valid after the object is written out and will be used for
+ * idx.oid is filled up before delta searching starts. idx.crc32 is
+ * only valid after the object is written out and will be used for
  * generating the index. idx.offset will be both gradually set and
  * used in writing phase (base objects get offset first, then deltas
  * refer to them)
  *
- * "size" is the uncompressed object size. Compressed size is not
- * cached (ie. raw data in a pack) but available via revindex.
+ * "size" is the uncompressed object size. Compressed size of the raw
+ * data for an object in a pack is not stored anywhere but is computed
+ * and made available when reverse .idx is made.
  *
  * "hash" contains a path name hash which is used for sorting the
  * delta list and also during delta searching. Once prepare_pack()
@@ -42,16 +43,16 @@ enum dfs_state {
  *
  * source pack info
  * ----------------
- * The (in_pack, in_pack_offset, in_pack_header_size) tuple contains
- * the location of the object in the source pack, with or without
- * header.
+ * The (in_pack, in_pack_offset) tuple contains the location of the
+ * object in the source pack. in_pack_header_size allows quickly
+ * skipping the header and going straight to the zlib stream.
  *
  * "type" and "in_pack_type" both describe object type. in_pack_type
  * may contain a delta type, while type is always the canonical type.
  *
  * deltas
  * ------
- * Delta links (delta, delta_child and delta_sibling) are created
+ * Delta links (delta, delta_child and delta_sibling) are created to
  * reflect that delta graph from the source pack then updated or added
  * during delta searching phase when we find better deltas.
  *
@@ -59,7 +60,7 @@ enum dfs_state {
  * compute_write_order(). "delta" and "delta_size" must remain valid
  * at object writing phase in case the delta is not cached.
  *
- * If a delta is cached in memory and is compressed delta_data points
+ * If a delta is cached in memory and is compressed, delta_data points
  * to the data and z_delta_size contains the compressed size. If it's
  * uncompressed [1], z_delta_size must be zero. delta_size is always
  * the uncompressed size and must be valid even if the delta is not
@@ -274,12 +275,19 @@ static inline unsigned long oe_size(const struct object_entry *e)
 	}
 }
 
+static inline int contains_in_32bits(unsigned long limit)
+{
+	uint32_t truncated_limit = (uint32_t)limit;
+
+	return limit == truncated_limit;
+}
+
 static inline int oe_size_less_than(const struct object_entry *e,
 				    unsigned long limit)
 {
 	if (e->size_valid)
 		return e->size_ < limit;
-	if (limit > maximum_unsigned_value_of_type(uint32_t))
+	if (contains_in_32bits(limit))
 		return 1;
 	return oe_size(e) < limit;
 }
@@ -289,8 +297,8 @@ static inline int oe_size_greater_than(const struct object_entry *e,
 {
 	if (e->size_valid)
 		return e->size_ > limit;
-	if (limit <= maximum_unsigned_value_of_type(uint32_t))
-		return 1;
+	if (contains_in_32bits(limit))
+		return 0;
 	return oe_size(e) > limit;
 }
 
@@ -314,7 +322,7 @@ static inline void oe_set_delta_size(struct packing_data *pack,
 				     unsigned long size)
 {
 	e->delta_size_ = size;
-	e->delta_size_valid =e->delta_size_ == size;
+	e->delta_size_valid = e->delta_size_ == size;
 	if (!e->delta_size_valid && size != oe_size(e))
 		die("BUG: this can only happen in check_object() "
 		    "where delta size is the same as entry size");

-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 01/11] pack-objects: a bit of document about struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                             ` (11 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The role of this comment block becomes more important after we shuffle
fields around to shrink this struct. It will be much harder to see what
field is related to what.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..de91edd264 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,51 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 is
+ * only valid after the object is written out and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size of the raw
+ * data for an object in a pack is not stored anywhere but is computed
+ * and made available when reverse .idx is made.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset) tuple contains the location of the
+ * object in the source pack. in_pack_header_size allows quickly
+ * skipping the header and going straight to the zlib stream.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created to
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed, delta_data points
+ * to the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 02/11] pack-objects: turn type and in_pack_type to bitfields
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                             ` (10 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

An extra field type_valid is added to carry the equivalent of OBJ_BAD
in the original "type" field. in_pack_type always contains a valid
type so we only need 3 bits for it.

A note about accepting OBJ_NONE as "valid" type. The function
read_object_list_from_stdin() can pass this value [1] and it
eventually calls create_object_entry() where the current code skips setting
"type" field if the incoming type is zero. This does not have any bad
side effects because "type" field should be memset()'d anyway.

But since we also need to set type_valid now, skipping oe_set_type()
leaves type_valid zero/false, which will make oe_type() return
OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in
prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger

    fatal: unable to get type of object ...

Accepting OBJ_NONE [2] does sound wrong, but this is how it has
been for a very long time and I haven't time to dig in further.

[1] See 5c49c11686 (pack-objects: better check_object() performances -
    2007-04-16)

[2] 21666f1aae (convert object type handling from a string to a number
    - 2007-02-26)

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 60 ++++++++++++++++++++++++------------------
 cache.h                |  2 ++
 object.h               |  1 -
 pack-bitmap-write.c    |  6 ++---
 pack-objects.h         | 20 ++++++++++++--
 5 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..647c01ea34 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -265,7 +265,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1066,8 +1067,7 @@ static void create_object_entry(const struct object_id *oid,
 
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
-	if (type)
-		entry->type = type;
+	oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1428,9 +1433,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1484,7 +1489,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->delta = base_entry;
 			entry->delta_size = entry->size;
 			entry->delta_sibling = base_entry->delta_child;
@@ -1493,7 +1498,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
@@ -1516,7 +1521,7 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,16 +1576,18 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
-		 * And if that fails, the error will be recorded in entry->type
+		 * And if that fails, the error will be recorded in oe_type(entry)
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &entry->size));
+	} else {
+		oe_set_type(entry, type);
 	}
 }
 
@@ -1747,10 +1755,12 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1826,7 +1836,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2432,11 +2442,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
@@ -2545,7 +2555,7 @@ static void read_object_list_from_stdin(void)
 			die("expected object ID, got garbage:\n %s", line);
 
 		add_preferred_base_object(p + 1);
-		add_object_entry(&oid, 0, p + 1, 0);
+		add_object_entry(&oid, OBJ_NONE, p + 1, 0);
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..fd11f08940 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -64,12 +64,12 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 
 		entry->in_pack_pos = i;
 
-		switch (entry->type) {
+		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
 		case OBJ_TREE:
 		case OBJ_BLOB:
 		case OBJ_TAG:
-			real_type = entry->type;
+			real_type = oe_type(entry);
 			break;
 
 		default:
@@ -98,7 +98,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 		default:
 			die("Missing type information for %s (%d/%d)",
 			    oid_to_hex(&entry->idx.oid), real_type,
-			    entry->type);
+			    oe_type(entry));
 		}
 	}
 }
diff --git a/pack-objects.h b/pack-objects.h
index de91edd264..5f568b609c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -59,8 +59,9 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
+	unsigned type_:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
+	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
@@ -123,4 +124,19 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= OBJ_NONE;
+	e->type_ = (unsigned)type;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 03/11] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                             ` (9 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 28 +++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 647c01ea34..83f8154865 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index 5f568b609c..4c6b73a4d6 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS	2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 /*
  * basic object info
  * -----------------
@@ -73,19 +88,10 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
 
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
+
 };
 
 struct packing_data {
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 04/11] pack-objects: use bitfield for object_entry::depth
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (2 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 21:26             ` Ævar Arnfjörð Bjarmason
  2018-03-17 14:10           ` [PATCH v5 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                             ` (8 subsequent siblings)
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Because of struct packing, from now on we can only handle a max depth
of 4095 (or even lower when new booleans are added to this struct). This
should be ok since long delta chains cause a significant slowdown
anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 4 ++++
 pack-objects.h                     | 5 ++---
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 83f8154865..829c80ffcc 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth >= (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS));
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 4c6b73a4d6..a4d8d29c04 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -89,9 +90,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-
-	int depth;
-
+	unsigned depth:OE_DEPTH_BITS;
 };
 
 struct packing_data {
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 05/11] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (3 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                             ` (7 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (like objects[], this array is never freed).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 ++-
 pack-bitmap-write.c    |  8 +++++---
 pack-bitmap.c          |  2 +-
 pack-bitmap.h          |  4 +++-
 pack-objects.h         | 16 +++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 829c80ffcc..727d200770 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -879,7 +879,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index fd11f08940..f7c897515b 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index a4d8d29c04..b832ee2b5e 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -79,7 +79,6 @@ struct object_entry {
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -99,6 +98,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -144,4 +145,17 @@ static inline void oe_set_type(struct object_entry *e,
 	e->type_ = (unsigned)type;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 06/11] pack-objects: move in_pack out of struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (4 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                             ` (6 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
pack, use an index, since the number of packs should be relatively
small.

This limits the number of packs we can handle to 16k. For now, if you
hit the 16k pack file limit, pack-objects will simply fail [1].

[1] The escape hatch is to use .keep files to keep the number of
    non-kept pack files below the 16k limit. Then you can go for another
    pack-objects run to combine another 16k pack files. Repeat until
    you're satisfied.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 ++++++
 builtin/pack-objects.c             | 40 +++++++++++++++++----------
 cache.h                            |  1 +
 pack-objects.h                     | 44 +++++++++++++++++++++++++++++-
 4 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 3503c9e3e6..b8d936ccf5 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -269,6 +269,15 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
+LIMITATIONS
+-----------
+
+This command could only handle 16384 existing pack files at a time.
+If you have more than this, you need to exclude some pack files with
+".keep" file and --honor-pack-keep option, to combine 16k pack files
+in one, then remove these .keep files and run pack-objects one more
+time.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 727d200770..eaf78fa41a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,8 @@
 #include "list.h"
 #include "packfile.h"
 
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (oe_type(entry) == OBJ_REF_DELTA ||
 		 oe_type(entry) == OBJ_OFS_DELTA)
@@ -1025,7 +1027,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			return want;
+			goto done;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1048,11 +1050,16 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				return want;
+				goto done;
 		}
 	}
 
-	return 1;
+	want = 1;
+done:
+	if (want && *found_pack && !(*found_pack)->index)
+		oe_add_pack(&to_pack, *found_pack);
+
+	return want;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1074,7 +1081,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		oe_set_in_pack(entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1406,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1542,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1587,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1857,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -3191,6 +3200,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
+	/* make sure IN_PACK(0) return NULL */
+	oe_add_pack(&to_pack, NULL);
+
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.h b/pack-objects.h
index b832ee2b5e..933f71a86b 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,6 +3,7 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		14
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -18,6 +19,10 @@ enum dfs_state {
 };
 
 /*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ *
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 is
@@ -65,7 +70,7 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -100,6 +105,8 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	int in_pack_count;
+	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -158,4 +165,39 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
+static inline unsigned int oe_add_pack(struct packing_data *pack,
+				       struct packed_git *p)
+{
+	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
+		die(_("too many packs to handle in one go. "
+		      "Please add .keep files to exclude\n"
+		      "some pack files and keep the number "
+		      "of non-kept files below %d."),
+		    1 << OE_IN_PACK_BITS);
+	if (p) {
+		if (p->index > 0)
+			die("BUG: this packed is already indexed");
+		p->index = pack->in_pack_count;
+	}
+	pack->in_pack[pack->in_pack_count] = p;
+	return pack->in_pack_count++;
+}
+
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	return pack->in_pack[e->in_pack_idx];
+
+}
+
+static inline void oe_set_in_pack(struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (p->index <= 0)
+		die("BUG: found_pack should be NULL "
+		    "instead of having non-positive index");
+	e->in_pack_idx = p->index;
+
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 07/11] pack-objects: refer to delta objects by index instead of pointer
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (5 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
                             ` (5 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

These delta pointers always point to elements in the objects[] array
in the packing_data struct. We can only hold a maximum of 4G of those
objects because the array size in nr_objects is uint32_t. We could use
uint32_t indexes to address these elements instead of pointers. On
64-bit architectures (8 bytes per pointer) this would save 4 bytes per
pointer.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
 pack-objects.h         |  67 ++++++++++++++++++++++--
 2 files changed, 124 insertions(+), 59 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index eaf78fa41a..379bd1ab92 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -127,11 +133,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -288,12 +294,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -317,7 +323,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -343,7 +349,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,8 +385,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -408,7 +414,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -427,7 +433,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -467,13 +473,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -489,7 +495,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -541,12 +547,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -608,34 +614,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -646,7 +652,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -661,8 +667,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -672,11 +678,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1498,10 +1504,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1572,17 +1578,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1622,7 +1630,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1647,7 +1655,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1670,7 +1678,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1685,7 +1693,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1870,7 +1878,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1946,7 +1954,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1975,7 +1983,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1984,13 +1992,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2059,7 +2067,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2109,7 +2117,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2117,7 +2125,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2438,7 +2446,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index 933f71a86b..0b831c8f12 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -72,11 +72,11 @@ struct object_entry {
 	unsigned long size;	/* uncompressed size */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -200,4 +200,61 @@ static inline void oe_set_in_pack(struct object_entry *e,
 
 }
 
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 08/11] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (6 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
                             ` (4 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only cache a delta when it's smaller than a certain limit. This limit
defaults to 1000, but we save the compressed length in a 64-bit field.
Shrink that field down to 16 bits, so you can only cache 65kb deltas.
Larger deltas must be recomputed when the pack is written down.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  3 ++-
 builtin/pack-objects.c   | 22 ++++++++++++++++------
 pack-objects.h           |  3 ++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9bd3f5a789..00fa824448 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2449,7 +2449,8 @@ pack.deltaCacheLimit::
 	The maximum size of a delta, that is cached in
 	linkgit:git-pack-objects[1]. This cache is used to speed up the
 	writing object phase by not having to recompute the final delta
-	result once the best match for all objects is found. Defaults to 1000.
+	result once the best match for all objects is found.
+	Defaults to 1000. Maximum value is 65535.
 
 pack.threads::
 	Specifies the number of threads to spawn when searching for best
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 379bd1ab92..71ca1ba2ce 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2105,12 +2105,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * between writes at that moment.
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
-			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
-			cache_lock();
-			delta_cache_size -= entry->delta_size;
-			delta_cache_size += entry->z_delta_size;
-			cache_unlock();
+			unsigned long size;
+
+			size = do_compress(&entry->delta_data, entry->delta_size);
+			if (size < (1 << OE_Z_DELTA_BITS)) {
+				entry->z_delta_size = size;
+				cache_lock();
+				delta_cache_size -= entry->delta_size;
+				delta_cache_size += entry->z_delta_size;
+				cache_unlock();
+			} else {
+				FREE_AND_NULL(entry->delta_data);
+				entry->z_delta_size = 0;
+			}
 		}
 
 		/* if we made n a delta, and if n is already at max
@@ -3089,6 +3096,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (depth >= (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
 		    depth, (1 << OE_DEPTH_BITS));
+	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
+		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
+		    1 << OE_Z_DELTA_BITS);
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 0b831c8f12..63222a76b0 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,7 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
+#define OE_Z_DELTA_BITS		16
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -79,7 +80,7 @@ struct object_entry {
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
-	unsigned long z_delta_size;	/* delta data size (compressed) */
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (7 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 19:57             ` Ævar Arnfjörð Bjarmason
  2018-03-18  5:09             ` Junio C Hamano
  2018-03-17 14:10           ` [PATCH v5 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
                             ` (3 subsequent siblings)
  12 siblings, 2 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

It's very very rare that an uncompressed object is larger than 4GB
(partly because Git does not handle those large files very well to
begin with). Let's optimize it for the common case where object size
is smaller than this limit.

Shrink size field down to 32 bits [1] and one overflow bit. If the size
is too large, we read it back from disk.

Add two compare helpers that can take advantage of the overflow
bit (e.g. if the file is 4GB+, chances are it's already larger than
core.bigFileThreshold and there's no point in comparing the actual
value).

A small note about the conditional oe_set_size() in
check_object(). Technically if we don't get a valid type, it's not
wrong if we set uninitialized value "size" (we don't pre-initialize
this and sha1_object_info will not assign anything when it fails to
get the info).

This however changes the writing code path slightly, which emits
different error messages (either way we die). One of our tests in t5530
depends on this specific error message. Let's just keep the test as-is
and play safe by not assigning a random value. That might trigger
valgrind anyway.

[1] it's actually already 32 bits on Windows

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 49 ++++++++++++++++++++++++++---------------
 pack-objects.h         | 50 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 19 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 71ca1ba2ce..887e12c556 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -274,7 +274,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (oe_type(entry) == OBJ_BLOB &&
-		    entry->size > big_file_threshold &&
+		    oe_size_greater_than(entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -384,12 +384,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
+	unsigned long entry_size = oe_size(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
-					      type, entry->size);
+					      type, entry_size);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -406,7 +407,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	datalen -= entry->in_pack_header_size;
 
 	if (!pack_to_stdout && p->index_version == 1 &&
-	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
 		error("corrupt packed object for %s",
 		      oid_to_hex(&entry->idx.oid));
 		unuse_pack(&w_curs);
@@ -1412,6 +1413,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
+	unsigned long size;
+
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
@@ -1431,13 +1434,14 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &entry->size);
+						   &size);
 		if (used == 0)
 			goto give_up;
 
 		if (type < 0)
 			die("BUG: invalid type %d", type);
 		entry->in_pack_type = type;
+		oe_set_size(entry, size);
 
 		/*
 		 * Determine if this is a delta and if so whether we can
@@ -1505,7 +1509,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = entry->size;
+			entry->delta_size = oe_size(entry);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1513,14 +1517,17 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (oe_type(entry)) {
+			unsigned long size;
+
+			size = get_size_from_delta(p, &w_curs,
+				entry->in_pack_offset + entry->in_pack_header_size);
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
-			if (entry->size == 0)
+			oe_set_size(entry, size);
+			if (oe_size_less_than(entry, 1))
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1535,13 +1542,15 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
 	 * as a preferred base.  Doing so can result in a larger
 	 * pack file, but the transfer will still take place.
 	 */
+	if (entry->type_valid)
+		oe_set_size(entry, size);
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1581,6 +1590,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
+	unsigned long size;
 
 	while (*idx) {
 		struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1593,7 +1603,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
-	oi.sizep = &entry->size;
+	oi.sizep = &size;
 	oi.typep = &type;
 	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
@@ -1603,10 +1613,11 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * and dealt with in prepare_pack().
 		 */
 		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
-						    &entry->size));
+						    &size));
 	} else {
 		oe_set_type(entry, type);
 	}
+	oe_set_size(entry, size);
 }
 
 /*
@@ -1746,7 +1757,7 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold < entry->size)
+		if (oe_size_greater_than(entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1775,6 +1786,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
+	unsigned long a_size = oe_size(a);
+	unsigned long b_size = oe_size(b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1788,9 +1801,9 @@ static int type_size_sort(const void *_a, const void *_b)
 		return -1;
 	if (a->preferred_base < b->preferred_base)
 		return 1;
-	if (a->size > b->size)
+	if (a_size > b_size)
 		return -1;
-	if (a->size < b->size)
+	if (a_size < b_size)
 		return 1;
 	return a < b ? -1 : (a > b);  /* newest first */
 }
@@ -1877,7 +1890,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = trg_entry->size;
+	trg_size = oe_size(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1889,7 +1902,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = src_entry->size;
+	src_size = oe_size(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2009,7 +2022,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += n->entry->size;
+		freed_mem += oe_size(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2459,7 +2472,7 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (entry->size < 50)
+		if (oe_size_less_than(entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
diff --git a/pack-objects.h b/pack-objects.h
index 63222a76b0..9a4ed7fdbe 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -70,7 +70,9 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;	/* uncompressed size */
+	/* object uncompressed size _if_ size_valid is true */
+	uint32_t size_;
+	unsigned size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
@@ -258,4 +260,50 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
+static inline unsigned long oe_size(const struct object_entry *e)
+{
+	if (e->size_valid) {
+		return e->size_;
+	} else {
+		unsigned long size;
+
+		sha1_object_info(e->idx.oid.hash, &size);
+		return size;
+	}
+}
+
+static inline int contains_in_32bits(unsigned long limit)
+{
+	uint32_t truncated_limit = (uint32_t)limit;
+
+	return limit == truncated_limit;
+}
+
+static inline int oe_size_less_than(const struct object_entry *e,
+				    unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ < limit;
+	if (contains_in_32bits(limit))
+		return 1;
+	return oe_size(e) < limit;
+}
+
+static inline int oe_size_greater_than(const struct object_entry *e,
+				       unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ > limit;
+	if (contains_in_32bits(limit))
+		return 0;
+	return oe_size(e) > limit;
+}
+
+static inline void oe_set_size(struct object_entry *e,
+			       unsigned long size)
+{
+	e->size_ = size;
+	e->size_valid = e->size_ == size;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 10/11] pack-objects: shrink delta_size field in struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (8 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 14:10           ` [PATCH v5 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
                             ` (2 subsequent siblings)
  12 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Allowing a delta size of 64 bits is crazy. Shrink this field down to
31 bits with one overflow bit.

If we find an existing delta larger than 2GB, we do not cache
delta_size at all and will get the value from oe_size(), potentially
from disk if it's larger than 4GB.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 24 ++++++++++++++----------
 pack-objects.h         | 23 ++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 887e12c556..fb2aba80bf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,10 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -140,7 +142,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != entry->delta_size)
+	if (!delta_buf || delta_size != DELTA_SIZE(entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -291,14 +293,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1509,7 +1511,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = oe_size(entry);
+			SET_DELTA_SIZE(entry, oe_size(entry));
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1895,7 +1897,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = trg_entry->delta_size;
+		max_size = DELTA_SIZE(trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -1966,10 +1968,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
+	if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
+		return 0;
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == trg_entry->delta_size &&
+		if (delta_size == DELTA_SIZE(trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -1984,7 +1988,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= trg_entry->delta_size;
+		delta_cache_size -= DELTA_SIZE(trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -1997,7 +2001,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	trg_entry->delta_size = delta_size;
+	SET_DELTA_SIZE(trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2120,11 +2124,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, entry->delta_size);
+			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
 			if (size < (1 << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
-				delta_cache_size -= entry->delta_size;
+				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index 9a4ed7fdbe..2507b157d5 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -5,6 +5,7 @@
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
 #define OE_Z_DELTA_BITS		16
+#define OE_DELTA_SIZE_BITS	31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -81,7 +82,8 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
-	unsigned long delta_size;	/* delta data size (uncompressed) */
+	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	uint32_t delta_size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -306,4 +308,23 @@ static inline void oe_set_size(struct object_entry *e,
 	e->size_valid = e->size_ == size;
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	if (e->delta_size_valid)
+		return e->delta_size_;
+	return oe_size(e);
+}
+
+static inline void oe_set_delta_size(struct packing_data *pack,
+				     struct object_entry *e,
+				     unsigned long size)
+{
+	e->delta_size_ = size;
+	e->delta_size_valid = e->delta_size_ == size;
+	if (!e->delta_size_valid && size != oe_size(e))
+		die("BUG: this can only happen in check_object() "
+		    "where delta size is the same as entry size");
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v5 11/11] pack-objects.h: reorder members to shrink struct object_entry
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (9 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-17 14:10           ` Nguyễn Thái Ngọc Duy
  2018-03-17 19:53             ` Ævar Arnfjörð Bjarmason
  2018-03-17 19:45           ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
  12 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-17 14:10 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Previous patches leave lots of holes and padding in this struct. This
patch reorders the members and shrinks the struct down to 80 bytes
(from 136 bytes, before any field shrinking is done) with 16 bits to
spare (and a couple more in in_pack_header_size when we really run out
of bits).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index 2507b157d5..8979289f5f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -71,35 +71,36 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	/* object uncompressed size _if_ size_valid is true */
-	uint32_t size_;
-	unsigned size_valid:1;
-	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
+	uint32_t hash;			/* name hint hash */
+	uint32_t size_;	/* object uncompressed size _if_ size_valid is true */
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	void *delta_data;	/* cached delta (uncompressed) */
 	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	unsigned size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
+	unsigned type_valid:1;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
-	unsigned type_valid:1;
-	uint32_t hash;			/* name hint hash */
-	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
 				    * objects against.
 				    */
 	unsigned no_try_delta:1;
+	unsigned char in_pack_header_size;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
+
+	/* size: 80, bit_padding: 16 bits */
 };
 
 struct packing_data {
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 00/11] nd/pack-objects-pack-struct updates
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (10 preceding siblings ...)
  2018-03-17 14:10           ` [PATCH v5 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-17 19:45           ` Ævar Arnfjörð Bjarmason
  2018-03-17 19:47             ` Ævar Arnfjörð Bjarmason
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
  12 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-17 19:45 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 17 2018, Nguyễn Thái Ngọc Duy jotted:

> v5 changes are small enough that the interdiff is pretty self
> explanatory (there's also a couple commit msg updates).

I've been testing this and it's definitely an improvement. I think it
would be good to get some mention in the commit messages themselves of
the incremental improvement, to that end I wrote this:

    $ cat /tmp/howmuch-mem.sh
    #!/bin/sh
    cd /tmp &&
    (
        for i in {1..3}
        do
            /usr/bin/time -f MaxRSS:%M ~/g/git/git --git-dir=/tmp/linux.git --exec-path=/home/avar/g/git repack -A -d 2>&1
        done | grep MaxRSS: | sort -n | head -n 1 | tr '\n' '\t' &&
        git git-commit-summary &&
        echo
    ) | tee -a /tmp/git-memory.log

I.e. we repack linux.git (I'd already repacked it once) and do three
runs, and take the lowest RSS size. This yields (I rebased your series
on top of git@github.com:git/git.git master and pushed it to
git@github.com:avar/git.git pack-objects-reduce-memory-footprint), via:

    git rebase --exec='make -j8 CFLAGS="-O3" && /tmp/howmuch-mem.sh' -i

That gave me, kb on the first column:

    MaxRSS:3746648  f23a196dd9 ("pack-objects: a bit of document about struct object_entry", 2018-03-01)
    MaxRSS:3700696  953b6473d7 ("pack-objects: turn type and in_pack_type to bitfields", 2018-03-01)
    MaxRSS:3700404  6cbe573539 ("pack-objects: use bitfield for object_entry::dfs_state", 2018-03-01)
    MaxRSS:3654044  0b93ebcae9 ("pack-objects: use bitfield for object_entry::depth", 2018-03-01)
    MaxRSS:3654040  67a4d48773 ("pack-objects: move in_pack_pos out of struct object_entry", 2018-03-01) [X]
    MaxRSS:3654104  e77319c65a ("pack-objects: move in_pack out of struct object_entry", 2018-03-01) [X]
    MaxRSS:3608096  a72cfcfea3 ("pack-objects: refer to delta objects by index instead of pointer", 2018-03-01)
    MaxRSS:3562212  76eaa779eb ("pack-objects: shrink z_delta_size field in struct object_entry", 2018-03-05)
    MaxRSS:3515164  42e28dd4b3 ("pack-objects: shrink size field in struct object_entry", 2018-03-05)
    MaxRSS:3469440  26eba3ded4 ("pack-objects: shrink delta_size field in struct object_entry", 2018-03-05)
    MaxRSS:3423704  c6493de964 ("pack-objects.h: reorder members to shrink struct object_entry", 2018-03-12)

I.e. on git.git we end up with just over an 8.5% reduction, and
interestingly have a slight increase over a past commit in one change,
and one that just makes 4kb of difference (marked via [X] above).

Also, your v0 says it overall saves 260MB of memory. According to this
it's 320MB. You did note some reductions in subsequent patches, but it's
worth calling that out explicitly.

I have a bigger in-house repo that looks like this with this change:

    MaxRSS:4753120  f23a196dd9 ("pack-objects: a bit of document about struct object_entry", 2018-03-01)
    MaxRSS:4699084  953b6473d7 ("pack-objects: turn type and in_pack_type to bitfields", 2018-03-01)
    MaxRSS:4699028  6cbe573539 ("pack-objects: use bitfield for object_entry::dfs_state", 2018-03-01)
    MaxRSS:4645452  0b93ebcae9 ("pack-objects: use bitfield for object_entry::depth", 2018-03-01)
    MaxRSS:4645288  67a4d48773 ("pack-objects: move in_pack_pos out of struct object_entry", 2018-03-01)
    MaxRSS:4645548  e77319c65a ("pack-objects: move in_pack out of struct object_entry", 2018-03-01)
    MaxRSS:4591484  a72cfcfea3 ("pack-objects: refer to delta objects by index instead of pointer", 2018-03-01)
    MaxRSS:4537980  76eaa779eb ("pack-objects: shrink z_delta_size field in struct object_entry", 2018-03-05)
    MaxRSS:4484148  42e28dd4b3 ("pack-objects: shrink size field in struct object_entry", 2018-03-05)
    MaxRSS:4430404  26eba3ded4 ("pack-objects: shrink delta_size field in struct object_entry", 2018-03-05)
    MaxRSS:4376148  c6493de964 ("pack-objects.h: reorder members to shrink struct object_entry", 2018-03-12)

I.e. a tad more than a 7.9% reduction in memory use.

This series also doesn't make a difference to the total runtime (which
is good, just wanted to make sure). On linux.git on my box best out of
three is 1:15.74 before and 1:14.93 after, which is within the margin of
random error.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 00/11] nd/pack-objects-pack-struct updates
  2018-03-17 19:45           ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
@ 2018-03-17 19:47             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-17 19:47 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 17 2018, Ævar Arnfjörð Bjarmason jotted:

> [...]I.e. on git.git we end up with just over a a 8.5% reduction, and[...]

Urgh, sorry, this should say "on linux.git...". None of these numbers
came from testing git.git.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 11/11] pack-objects.h: reorder members to shrink struct object_entry
  2018-03-17 14:10           ` [PATCH v5 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-17 19:53             ` Ævar Arnfjörð Bjarmason
  2018-03-18  8:49               ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-17 19:53 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 17 2018, Nguyễn Thái Ngọc Duy jotted:

> Previous patches leave lots of holes and padding in this struct. This
> patch reorders the members and shrinks the struct down to 80 bytes
> (from 136 bytes, before any field shrinking is done) with 16 bits to
> spare (and a couple more in in_pack_header_size when we really run out
> of bits).

Given what I mentioned in 87po42cwql.fsf@evledraar.gmail.com just now I
think we should add this to the commit message.

    This is the last in a series of memory reduction patches (see
    "pack-objects: a bit of document about struct object_entry" for the
    first one).

    Overall they've reduced repack memory size on linux.git from 3.747G
    to 3.424G, or by around 320M, a decrease of 8.5%. The runtime of
    repack has stayed the same throughout this series. Ævar's testing on
    a big monorepo he has access to (bigger than linux.git) has shown a
    7.9% reduction, so the overall expected improvement should be
    somewhere around 8%.

    See 87po42cwql.fsf@evledraar.gmail.com on-list
    (https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/)
    for more detailed numbers and a test script used to produce the
    numbers cited above.

Thanks again for working on this.

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  pack-objects.h | 17 +++++++++--------
>  1 file changed, 9 insertions(+), 8 deletions(-)
>
> diff --git a/pack-objects.h b/pack-objects.h
> index 2507b157d5..8979289f5f 100644
> --- a/pack-objects.h
> +++ b/pack-objects.h
> @@ -71,35 +71,36 @@ enum dfs_state {
>   */
>  struct object_entry {
>  	struct pack_idx_entry idx;
> -	/* object uncompressed size _if_ size_valid is true */
> -	uint32_t size_;
> -	unsigned size_valid:1;
> -	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
> +	void *delta_data;	/* cached delta (uncompressed) */
>  	off_t in_pack_offset;
> +	uint32_t hash;			/* name hint hash */
> +	uint32_t size_;	/* object uncompressed size _if_ size_valid is true */
>  	uint32_t delta_idx;	/* delta base object */
>  	uint32_t delta_child_idx; /* deltified objects who bases me */
>  	uint32_t delta_sibling_idx; /* other deltified objects who
>  				     * uses the same base as me
>  				     */
> -	void *delta_data;	/* cached delta (uncompressed) */
>  	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
>  	uint32_t delta_size_valid:1;
> +	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
> +	unsigned size_valid:1;
>  	unsigned z_delta_size:OE_Z_DELTA_BITS;
> +	unsigned type_valid:1;
>  	unsigned type_:TYPE_BITS;
>  	unsigned in_pack_type:TYPE_BITS; /* could be delta */
> -	unsigned type_valid:1;
> -	uint32_t hash;			/* name hint hash */
> -	unsigned char in_pack_header_size;
>  	unsigned preferred_base:1; /*
>  				    * we do not pack this, but is available
>  				    * to be used as the base object to delta
>  				    * objects against.
>  				    */
>  	unsigned no_try_delta:1;
> +	unsigned char in_pack_header_size;
>  	unsigned tagged:1; /* near the very tip of refs */
>  	unsigned filled:1; /* assigned write-order */
>  	unsigned dfs_state:OE_DFS_STATE_BITS;
>  	unsigned depth:OE_DEPTH_BITS;
> +
> +	/* size: 80, bit_padding: 16 bits */
>  };
>
>  struct packing_data {

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-17 14:10           ` [PATCH v5 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-17 19:57             ` Ævar Arnfjörð Bjarmason
  2018-03-18  5:09             ` Junio C Hamano
  1 sibling, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-17 19:57 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 17 2018, Nguyễn Thái Ngọc Duy jotted:

> It's very very rare that an uncompressd object is larger than 4GB

s/uncompressd/uncompressed/

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 04/11] pack-objects: use bitfield for object_entry::depth
  2018-03-17 14:10           ` [PATCH v5 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-17 21:26             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-17 21:26 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 17 2018, Nguyễn Thái Ngọc Duy jotted:

> Because of struct packing from now on we can only handle max depth
> 4095
> [...]
> +	if (depth >= (1 << OE_DEPTH_BITS))
> +		die(_("delta chain depth %d is greater than maximum limit %d"),
> +		    depth, (1 << OE_DEPTH_BITS));
> +

This has a off-by-one error:

    $ git repack --depth=4096
    fatal: delta chain depth 4096 is greater than maximum limit 4096

Per the check we should be feeding `(1 << OE_DEPTH_BITS) - 1` to die().

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Why does pack-objects use so much memory on incremental packing?
  2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
                   ` (4 preceding siblings ...)
  2018-03-02 10:18 ` Reduce pack-objects memory footprint? Duy Nguyen
@ 2018-03-17 22:05 ` Ævar Arnfjörð Bjarmason
  2018-03-18  8:37   ` Duy Nguyen
  2018-03-20  5:28   ` Jeff King
  5 siblings, 2 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-17 22:05 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: git, Jeff King, Jonathan Tan, Jeff Hostetler


On Wed, Feb 28 2018, Duy Nguyen jotted:

> linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
> consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
> all apps nearly unusuable (granted the problem is partly Linux I/O
> scheduler too). So I wonder if we can reduce pack-objects memory
> footprint a bit.
>
> This demonstration patch (probably breaks some tests) would reduce the
> size of struct object_entry from from 136 down to 112 bytes on
> x86-64. There are 6483999 of these objects, so the saving is 17% or
> 148 MB.

Splitting this off into its own thread. Aside from the improvements in
your repack memory reduction (20180317141033.21545-1-pclouds@gmail.com)
and gc config (20180316192745.19557-1-pclouds@gmail.com) series, I'm
wondering why repack takes so much memory to incrementally repack new
stuff when you leave out the base pack.

Repacking git.git takes around 290MB of memory on my system, but I'd
think that this would make it take a mere few megabytes, since all I'm
asking it to do is pack up the few loose objects that got added and keep
the base pack:

    (
        rm -rf /tmp/git &&
        git clone git@github.com:git/git.git /tmp/git &&
        cd /tmp/git &&
        touch $(ls .git/objects/pack/*pack | sed 's/\.pack$/.keep/') &&
        for i in {1..10}
        do
            touch $i &&
            git add $i &&
            git commit -m$i
        done &&
        /usr/bin/time -f %M git repack -A -d
    )

But no, it takes around 230MB. But thinking about it a bit further:

 * This builds on top of existing history, so that needs to be
   read/consulted

 * We might be reusing (if not directly, skipping re-computing) deltas
   from the existing pack.

But I get the same result if after cloning I make an orphan branch, and
pass all the "do this as cheaply as possible" branches I can find down
to git-repack:

    (
        rm -rf /tmp/git &&
        git clone git@github.com:git/git.git /tmp/git &&
        cd /tmp/git &&
        touch $(ls .git/objects/pack/*pack | sed 's/\.pack$/.keep/') &&
        git checkout --orphan new &&
        git reset --hard &&
        for i in {1..10}
        do
            touch $i &&
            git add $i &&
            git commit -m$i
        done &&
        git tag -d $(git tag -l) &&
        /usr/bin/time -f %M git repack -A -d -f -F --window=1 --depth=1
    )

But the memory use barely changes, my first example used 227924 kb, but
this one uses 226788.

Of course nobody's going to clone a huge repo and then right away create
an --orphan branch, but is there an inherent reason for why this
couldn't be taking as much memory as if the repo was cloned with
--depth=1?

I.e. when I have a *.keep on an existing pack we would have some
low-memory mode to copy the trees/blobs needed for the current commit
over to the new pack, and use that as the basis for packing everything
going forward.

Jeff: Is this something ref islands[1] could be (ab)used to do, or have
I misunderstood that concept?

1. https://public-inbox.org/git/20130626051117.GB26755@sigill.intra.peff.net/
   https://public-inbox.org/git/20160304153359.GA16300@sigill.intra.peff.net/
   https://public-inbox.org/git/20160809174528.2ydgkhd7ayclat3t@sigill.intra.peff.net/

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-17 14:10           ` [PATCH v5 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
  2018-03-17 19:57             ` Ævar Arnfjörð Bjarmason
@ 2018-03-18  5:09             ` Junio C Hamano
  2018-03-18  8:23               ` Duy Nguyen
  1 sibling, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-18  5:09 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +static inline int contains_in_32bits(unsigned long limit)
> +{

This name somehow does not sound right.

If the verb "contain" must be used, the way to phrase what this
function does with the verb is to say "limit can be contained in a
32-bit int", so "contains" is probably where the funniness comes
from.

"fits in 32bits" is OK, I think.

> +	uint32_t truncated_limit = (uint32_t)limit;
> +
> +	return limit == truncated_limit;
> +}

I am guessing that a compiler that is clever enough will make this
function a no-op on a 32-bit arch and that is why it is a static
inline function?

> +static inline int oe_size_less_than(const struct object_entry *e,
> +				    unsigned long limit)
> +{
> +	if (e->size_valid)
> +		return e->size_ < limit;

e->size_ is the true size so we can compare it to see if it is smaller
than limit.

> +	if (contains_in_32bits(limit))
> +		return 1;

If limit is small enough, and because !e->size_valid means the real
size does not fit in 32-bit, we know size is larger than limit.
Shouldn't we be returning 0 that means "no, the size is not less
than limit" from here?

> +	return oe_size(e) < limit;
> +}
> +
> +static inline int oe_size_greater_than(const struct object_entry *e,
> +				       unsigned long limit)
> +{
> +	if (e->size_valid)
> +		return e->size_ > limit;

e->size_ is the true size so we compare and return if it is larger
than limit.

> +	if (contains_in_32bits(limit))
> +		return 0;

Now e->size_ is larger than what would fit within 32-bit.  If limit
fits within 32-bit, then size must be larger than limit.  Again,
shouldn't we be returning 1 that means "yes, the size is greater
than limit" from here?

> +	return oe_size(e) > limit;
> +}
> +
> +static inline void oe_set_size(struct object_entry *e,
> +			       unsigned long size)
> +{
> +	e->size_ = size;
> +	e->size_valid = e->size_ == size;
> +}
> +
>  #endif

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-18  5:09             ` Junio C Hamano
@ 2018-03-18  8:23               ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-18  8:23 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Sun, Mar 18, 2018 at 6:09 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> +     uint32_t truncated_limit = (uint32_t)limit;
>> +
>> +     return limit == truncated_limit;
>> +}
>
> I am guessing that a compiler that is clever enough will make this
> function a no-op on a 32-bit arch and that is why it is a static
> inline function?

It's a separate function because I don't want to duplicate this ==
logic twice. Even if the compiler does not optimize this, it's still
much cheaper than oe_size() which involves disk access.

>> +static inline int oe_size_less_than(const struct object_entry *e,
>> +                                 unsigned long limit)
>> +{
>> +     if (e->size_valid)
>> +             return e->size_ < limit;
>
> e->size_ is the true size so we can compare it to see if it is smaller
> than limit.
>
>> +     if (contains_in_32bits(limit))
>> +             return 1;
>
> If limit is small enough, and because e->size_valid means e->size_
> does not fit in 32-bit, we know size is larger than limit.
> Shouldn't we be returning 0 that means "no, the size is not less
> than limit" from here?

Argh!!! This logic keeps messing with my brain.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Why does pack-objects use so much memory on incremental packing?
  2018-03-17 22:05 ` Why does pack-objects use so much memory on incremental packing? Ævar Arnfjörð Bjarmason
@ 2018-03-18  8:37   ` Duy Nguyen
  2018-03-20  5:28   ` Jeff King
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-18  8:37 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Git Mailing List, Jeff King, Jonathan Tan, Jeff Hostetler

On Sat, Mar 17, 2018 at 11:05 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Wed, Feb 28 2018, Duy Nguyen jotted:
>
>> linux-2.6.git current has 6483999 objects. "git gc" on my poor laptop
>> consumes 1.7G out of 4G RAM, pushing lots of data to swap and making
>> all apps nearly unusuable (granted the problem is partly Linux I/O
>> scheduler too). So I wonder if we can reduce pack-objects memory
>> footprint a bit.
>>
>> This demonstration patch (probably breaks some tests) would reduce the
>> size of struct object_entry from from 136 down to 112 bytes on
>> x86-64. There are 6483999 of these objects, so the saving is 17% or
>> 148 MB.
>
> Splitting this off into its own thread. Aside from the improvements in
> your repack memory reduction (20180317141033.21545-1-pclouds@gmail.com)
> and gc config (20180316192745.19557-1-pclouds@gmail.com) series's I'm
> wondering why repack takes so much memory to incrementally repack new
> stuff when you leave out the base pack.
>
> Repacking git.git takes around 290MB of memory on my system, but I'd
> think that this would make it take a mere few megabytes, since all I'm
> asking it to do is pack up the few loose objects that got added and keep
> the base pack:
>
> ...
>

I left some clue in the new estimate_repack_memory() function in my gc
series that could help you find this out. I haven't really tested this
case but my guess is the two cache pools we have will likely be filled
up close to full anyway and hit delta_base_cache_limit and
max_delta_cache_size limits. When these are really full on default
configuration, they'll take roughly ~300mb.

The second is, I think we still go through all objects to mark which
one is included in the new pack, which one not (and probably which one
can be delta base candidates). Try calling alloc_report() function at
the end of repack to see exactly how much memory is locked in there.
This we could perhaps improve for incremental repack by avoiding
running rev-list.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v5 11/11] pack-objects.h: reorder members to shrink struct object_entry
  2018-03-17 19:53             ` Ævar Arnfjörð Bjarmason
@ 2018-03-18  8:49               ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-18  8:49 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Junio C Hamano, Jeff King

On Sat, Mar 17, 2018 at 8:53 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Sat, Mar 17 2018, Nguyễn Thái Ngọc Duy jotted:
>
>> Previous patches leave lots of holes and padding in this struct. This
>> patch reorders the members and shrinks the struct down to 80 bytes
>> (from 136 bytes, before any field shrinking is done) with 16 bits to
>> spare (and a couple more in in_pack_header_size when we really run out
>> of bits).
>
> Given what I mentioned in 87po42cwql.fsf@evledraar.gmail.com just now I
> think we should add this to the commit message.
>
>     This is the last in a series of memory reduction patches (see
>     "pack-objects: a bit of document about struct object_entry" for the
>     first one).
>
>     Overall they've reduced repack memory size on linux.git from 3.747G
>     to 3.424G, or by around 320M, a decrease of 8.5%. The runtime of
>     repack has stayed the same throughout this series. Ævar's testing on
>     a big monorepo he has access to (bigger than linux.git) has shown a
>     7.9% reduction, so the overall expected improvement should be
>     somewhere around 8%.
>
>     See 87po42cwql.fsf@evledraar.gmail.com on-list
>     (https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/)
>     for more detailed numbers and a test script used to produce the
>     numbers cited above.

Yeah.

I probably should add something that was on my mind but never written
out. These shrinking and packing definitely slow down access to these
struct members (more instructions to read or write). However, since
pack-objects is mostly IO-bound, and when it's CPU-bound, I think the
hot path is either inflating objects or generating deltas, these
slowdowns do not matter (and smaller cache footprint helps too)
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
                             ` (11 preceding siblings ...)
  2018-03-17 19:45           ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
@ 2018-03-18 14:25           ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
                               ` (13 more replies)
  12 siblings, 14 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

v6 fixes the one optimization that I just couldn't get right, fixes
two off-by-one error messages and a couple of commit message updates
(biggest change is in 11/11 to record some numbers from AEvar)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fb2aba80bf..4406af640f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3112,10 +3112,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 
 	if (depth >= (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
-		    depth, (1 << OE_DEPTH_BITS));
+		    depth, (1 << OE_DEPTH_BITS) - 1);
 	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
 		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
-		    1 << OE_Z_DELTA_BITS);
+		    (1 << OE_Z_DELTA_BITS) - 1);
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 55358da9f3..af40211105 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -275,7 +275,7 @@ static inline unsigned long oe_size(const struct object_entry *e)
 	}
 }
 
-static inline int contains_in_32bits(unsigned long limit)
+static inline int oe_fits_in_32bits(unsigned long limit)
 {
 	uint32_t truncated_limit = (uint32_t)limit;
 
@@ -287,8 +287,8 @@ static inline int oe_size_less_than(const struct object_entry *e,
 {
 	if (e->size_valid)
 		return e->size_ < limit;
-	if (contains_in_32bits(limit))
-		return 1;
+	if (oe_fits_in_32bits(limit)) /* limit < 2^32 <= size ? */
+		return 0;
 	return oe_size(e) < limit;
 }
 
@@ -297,8 +297,8 @@ static inline int oe_size_greater_than(const struct object_entry *e,
 {
 	if (e->size_valid)
 		return e->size_ > limit;
-	if (contains_in_32bits(limit))
-		return 0;
+	if (oe_fits_in_32bits(limit)) /* limit < 2^32 <= size ? */
+		return 1;
 	return oe_size(e) > limit;
 }
 
@@ -307,6 +307,14 @@ static inline void oe_set_size(struct object_entry *e,
 {
 	e->size_ = size;
 	e->size_valid = e->size_ == size;
+
+	if (!e->size_valid) {
+		unsigned long real_size;
+
+		if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
+		    size != real_size)
+			die("BUG: 'size' is supposed to be the object size!");
+	}
 }
 
 static inline unsigned long oe_delta_size(struct packing_data *pack,

-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 01/11] pack-objects: a bit of document about struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                               ` (12 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The role of this comment block becomes more important after we shuffle
fields around to shrink this struct. It will be much harder to see what
field is related to what.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..c0a1f61aac 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,51 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 is
+ * only valid after the object is written out and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size of the raw
+ * data for an object in a pack is not stored anywhere but is computed
+ * and made available when reverse .idx is made.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset) tuple contains the location of the
+ * object in the source pack. in_pack_header_size allows quickly
+ * skipping the header and going straight to the zlib stream.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created to
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed, delta_data points
+ * to the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 02/11] pack-objects: turn type and in_pack_type to bitfields
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                               ` (11 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

An extra field type_valid is added to carry the equivalent of OBJ_BAD
in the original "type" field. in_pack_type always contains a valid
type so we only need 3 bits for it.

A note about accepting OBJ_NONE as "valid" type. The function
read_object_list_from_stdin() can pass this value [1] and it
eventually calls create_object_entry() where current code skips setting
"type" field if the incoming type is zero. This does not have any bad
side effects because "type" field should be memset()'d anyway.

But since we also need to set type_valid now, skipping oe_set_type()
leaves type_valid zero/false, which will make oe_type() return
OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in
prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger

    fatal: unable to get type of object ...

Accepting OBJ_NONE [2] does sound wrong, but this is how it has
been for a very long time and I haven't had time to dig in further.

[1] See 5c49c11686 (pack-objects: better check_object() performances -
    2007-04-16)

[2] 21666f1aae (convert object type handling from a string to a number
    - 2007-02-26)

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 60 ++++++++++++++++++++++++------------------
 cache.h                |  2 ++
 object.h               |  1 -
 pack-bitmap-write.c    |  6 ++---
 pack-objects.h         | 20 ++++++++++++--
 5 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..647c01ea34 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -265,7 +265,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1066,8 +1067,7 @@ static void create_object_entry(const struct object_id *oid,
 
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
-	if (type)
-		entry->type = type;
+	oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1428,9 +1433,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1484,7 +1489,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->delta = base_entry;
 			entry->delta_size = entry->size;
 			entry->delta_sibling = base_entry->delta_child;
@@ -1493,7 +1498,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
@@ -1516,7 +1521,7 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,16 +1576,18 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
-		 * And if that fails, the error will be recorded in entry->type
+		 * And if that fails, the error will be recorded in oe_type(entry)
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &entry->size));
+	} else {
+		oe_set_type(entry, type);
 	}
 }
 
@@ -1747,10 +1755,12 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1826,7 +1836,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2432,11 +2442,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
@@ -2545,7 +2555,7 @@ static void read_object_list_from_stdin(void)
 			die("expected object ID, got garbage:\n %s", line);
 
 		add_preferred_base_object(p + 1);
-		add_object_entry(&oid, 0, p + 1, 0);
+		add_object_entry(&oid, OBJ_NONE, p + 1, 0);
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..fd11f08940 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -64,12 +64,12 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 
 		entry->in_pack_pos = i;
 
-		switch (entry->type) {
+		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
 		case OBJ_TREE:
 		case OBJ_BLOB:
 		case OBJ_TAG:
-			real_type = entry->type;
+			real_type = oe_type(entry);
 			break;
 
 		default:
@@ -98,7 +98,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 		default:
 			die("Missing type information for %s (%d/%d)",
 			    oid_to_hex(&entry->idx.oid), real_type,
-			    entry->type);
+			    oe_type(entry));
 		}
 	}
 }
diff --git a/pack-objects.h b/pack-objects.h
index c0a1f61aac..b883d7aa10 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -59,8 +59,9 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
+	unsigned type_:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
+	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
@@ -123,4 +124,19 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= OBJ_NONE;
+	e->type_ = (unsigned)type;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 03/11] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                               ` (10 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 28 +++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 647c01ea34..83f8154865 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index b883d7aa10..8507e1b869 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS	2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 /*
  * basic object info
  * -----------------
@@ -73,19 +88,10 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
 
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
+
 };
 
 struct packing_data {
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 04/11] pack-objects: use bitfield for object_entry::depth
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (2 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                               ` (9 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Because of struct packing, from now on we can only handle max depth
4095 (or even lower when new booleans are added in this struct). This
should be ok since long delta chains will cause significant slowdown
anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 4 ++++
 pack-objects.h                     | 5 ++---
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 83f8154865..205e1f646c 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth >= (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS) - 1);
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 8507e1b869..59407aae3c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -89,9 +90,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-
-	int depth;
-
+	unsigned depth:OE_DEPTH_BITS;
 };
 
 struct packing_data {
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 05/11] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (3 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                               ` (8 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (it is not freed, just as objects[] is not
freed).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 ++-
 pack-bitmap-write.c    |  8 +++++---
 pack-bitmap.c          |  2 +-
 pack-bitmap.h          |  4 +++-
 pack-objects.h         | 16 +++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 205e1f646c..e1244918a5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -879,7 +879,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index fd11f08940..f7c897515b 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index 59407aae3c..4a11653657 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -79,7 +79,6 @@ struct object_entry {
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -99,6 +98,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -144,4 +145,17 @@ static inline void oe_set_type(struct object_entry *e,
 	e->type_ = (unsigned)type;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 06/11] pack-objects: move in_pack out of struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (4 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                               ` (7 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
pack, use an index, since the number of packs should be relatively
small.

This limits the number of packs we can handle to 16k. For now, if you
hit the 16k pack file limit, pack-objects will simply fail [1].

[1] The escape hatch is to use .keep files to keep the number of
    non-kept pack files below the 16k limit. Then you can go for
    another pack-objects run to combine another 16k pack files.
    Repeat until you're satisfied.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 ++++++
 builtin/pack-objects.c             | 40 +++++++++++++++++----------
 cache.h                            |  1 +
 pack-objects.h                     | 44 +++++++++++++++++++++++++++++-
 4 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 3503c9e3e6..b8d936ccf5 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -269,6 +269,15 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
+LIMITATIONS
+-----------
+
+This command could only handle 16384 existing pack files at a time.
+If you have more than this, you need to exclude some pack files with
+".keep" file and --honor-pack-keep option, to combine 16k pack files
+in one, then remove these .keep files and run pack-objects one more
+time.
+
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index e1244918a5..9792d31e46 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,8 @@
 #include "list.h"
 #include "packfile.h"
 
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (oe_type(entry) == OBJ_REF_DELTA ||
 		 oe_type(entry) == OBJ_OFS_DELTA)
@@ -1025,7 +1027,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			return want;
+			goto done;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1048,11 +1050,16 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				return want;
+				goto done;
 		}
 	}
 
-	return 1;
+	want = 1;
+done:
+	if (want && *found_pack && !(*found_pack)->index)
+		oe_add_pack(&to_pack, *found_pack);
+
+	return want;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1074,7 +1081,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		oe_set_in_pack(entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1406,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1542,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1587,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1857,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -3191,6 +3200,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
+	/* make sure IN_PACK(0) return NULL */
+	oe_add_pack(&to_pack, NULL);
+
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.h b/pack-objects.h
index 4a11653657..bf905c3f9b 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,6 +3,7 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		14
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -18,6 +19,10 @@ enum dfs_state {
 };
 
 /*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ *
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 is
@@ -65,7 +70,7 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -100,6 +105,8 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+	int in_pack_count;
+	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -158,4 +165,39 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
+static inline unsigned int oe_add_pack(struct packing_data *pack,
+				       struct packed_git *p)
+{
+	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
+		die(_("too many packs to handle in one go. "
+		      "Please add .keep files to exclude\n"
+		      "some pack files and keep the number "
+		      "of non-kept files below %d."),
+		    1 << OE_IN_PACK_BITS);
+	if (p) {
+		if (p->index > 0)
+			die("BUG: this packed is already indexed");
+		p->index = pack->in_pack_count;
+	}
+	pack->in_pack[pack->in_pack_count] = p;
+	return pack->in_pack_count++;
+}
+
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	return pack->in_pack[e->in_pack_idx];
+
+}
+
+static inline void oe_set_in_pack(struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (p->index <= 0)
+		die("BUG: found_pack should be NULL "
+		    "instead of having non-positive index");
+	e->in_pack_idx = p->index;
+
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 07/11] pack-objects: refer to delta objects by index instead of pointer
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (5 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
                               ` (6 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

These delta pointers always point to elements in the objects[] array
in packing_data struct. We can only hold a maximum of 4G of those
objects because the array size in nr_objects is uint32_t. We could use
uint32_t indexes to address these elements instead of pointers. On
64-bit architectures (8 bytes per pointer) this would save 4 bytes per
pointer.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
 pack-objects.h         |  67 ++++++++++++++++++++++--
 2 files changed, 124 insertions(+), 59 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 9792d31e46..b39234f7fb 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -127,11 +133,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -288,12 +294,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -317,7 +323,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -343,7 +349,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,8 +385,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -408,7 +414,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -427,7 +433,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -467,13 +473,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -489,7 +495,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -541,12 +547,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -608,34 +614,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -646,7 +652,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -661,8 +667,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -672,11 +678,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1498,10 +1504,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1572,17 +1578,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1622,7 +1630,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1647,7 +1655,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1670,7 +1678,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1685,7 +1693,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1870,7 +1878,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1946,7 +1954,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1975,7 +1983,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1984,13 +1992,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2059,7 +2067,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2109,7 +2117,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2117,7 +2125,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2438,7 +2446,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index bf905c3f9b..594a213554 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -72,11 +72,11 @@ struct object_entry {
 	unsigned long size;	/* uncompressed size */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -200,4 +200,61 @@ static inline void oe_set_in_pack(struct object_entry *e,
 
 }
 
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 08/11] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (6 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
                               ` (5 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only cache deltas when they are smaller than a certain limit. This
limit defaults to 1000 but we save the compressed length in a 64-bit
field. Shrink that field down to 16 bits, so you can only cache 65kb
deltas. Larger deltas must be recomputed when the pack is written down.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  3 ++-
 builtin/pack-objects.c   | 22 ++++++++++++++++------
 pack-objects.h           |  3 ++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9bd3f5a789..00fa824448 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2449,7 +2449,8 @@ pack.deltaCacheLimit::
 	The maximum size of a delta, that is cached in
 	linkgit:git-pack-objects[1]. This cache is used to speed up the
 	writing object phase by not having to recompute the final delta
-	result once the best match for all objects is found. Defaults to 1000.
+	result once the best match for all objects is found.
+	Defaults to 1000. Maximum value is 65535.
 
 pack.threads::
 	Specifies the number of threads to spawn when searching for best
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b39234f7fb..372afe48c4 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2105,12 +2105,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * between writes at that moment.
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
-			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
-			cache_lock();
-			delta_cache_size -= entry->delta_size;
-			delta_cache_size += entry->z_delta_size;
-			cache_unlock();
+			unsigned long size;
+
+			size = do_compress(&entry->delta_data, entry->delta_size);
+			if (size < (1 << OE_Z_DELTA_BITS)) {
+				entry->z_delta_size = size;
+				cache_lock();
+				delta_cache_size -= entry->delta_size;
+				delta_cache_size += entry->z_delta_size;
+				cache_unlock();
+			} else {
+				FREE_AND_NULL(entry->delta_data);
+				entry->z_delta_size = 0;
+			}
 		}
 
 		/* if we made n a delta, and if n is already at max
@@ -3089,6 +3096,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (depth >= (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
 		    depth, (1 << OE_DEPTH_BITS) - 1);
+	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
+		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
+		    (1 << OE_Z_DELTA_BITS) - 1);
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 594a213554..c12219385a 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,7 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
+#define OE_Z_DELTA_BITS		16
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -79,7 +80,7 @@ struct object_entry {
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
-	unsigned long z_delta_size;	/* delta data size (compressed) */
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (7 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:49               ` Ævar Arnfjörð Bjarmason
                                 ` (2 more replies)
  2018-03-18 14:25             ` [PATCH v6 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
                               ` (4 subsequent siblings)
  13 siblings, 3 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

It's very very rare that an uncompressed object is larger than 4GB
(partly because Git does not handle those large files very well to
begin with). Let's optimize it for the common case where object size
is smaller than this limit.

Shrink size field down to 32 bits [1] and one overflow bit. If the size
is too large, we read it back from disk.

Add two compare helpers that can take advantage of the overflow
bit (e.g. if the file is 4GB+, chances are it's already larger than
core.bigFileThreshold and there's no point in comparing the actual
value).

A small note about the conditional oe_set_size() in
check_object(). Technically if we don't get a valid type, it's not
wrong if we set uninitialized value "size" (we don't pre-initialize
this and sha1_object_info will not assign anything when it fails to
get the info).

This however changes the writing code path slightly, which emits
different error messages (either way we die). One of our tests in t5530
depends on this specific error message. Let's just keep the test as-is
and play safe by not assigning a random value. That might trigger
valgrind anyway.

[1] it's actually already 32 bits on Windows

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 49 ++++++++++++++++++++++-------------
 pack-objects.h         | 58 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 372afe48c4..89ed4b5125 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -274,7 +274,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (oe_type(entry) == OBJ_BLOB &&
-		    entry->size > big_file_threshold &&
+		    oe_size_greater_than(entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -384,12 +384,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
+	unsigned long entry_size = oe_size(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
-					      type, entry->size);
+					      type, entry_size);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -406,7 +407,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	datalen -= entry->in_pack_header_size;
 
 	if (!pack_to_stdout && p->index_version == 1 &&
-	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
 		error("corrupt packed object for %s",
 		      oid_to_hex(&entry->idx.oid));
 		unuse_pack(&w_curs);
@@ -1412,6 +1413,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
+	unsigned long size;
+
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
@@ -1431,13 +1434,14 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &entry->size);
+						   &size);
 		if (used == 0)
 			goto give_up;
 
 		if (type < 0)
 			die("BUG: invalid type %d", type);
 		entry->in_pack_type = type;
+		oe_set_size(entry, size);
 
 		/*
 		 * Determine if this is a delta and if so whether we can
@@ -1505,7 +1509,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = entry->size;
+			entry->delta_size = oe_size(entry);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1513,14 +1517,17 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (oe_type(entry)) {
+			unsigned long size;
+
+			size = get_size_from_delta(p, &w_curs,
+				entry->in_pack_offset + entry->in_pack_header_size);
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
-			if (entry->size == 0)
+			oe_set_size(entry, size);
+			if (oe_size_less_than(entry, 1))
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1535,13 +1542,15 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
 	 * as a preferred base.  Doing so can result in a larger
 	 * pack file, but the transfer will still take place.
 	 */
+	if (entry->type_valid)
+		oe_set_size(entry, size);
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1581,6 +1590,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
+	unsigned long size;
 
 	while (*idx) {
 		struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1593,7 +1603,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
-	oi.sizep = &entry->size;
+	oi.sizep = &size;
 	oi.typep = &type;
 	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
@@ -1603,10 +1613,11 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * and dealt with in prepare_pack().
 		 */
 		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
-						    &entry->size));
+						    &size));
 	} else {
 		oe_set_type(entry, type);
 	}
+	oe_set_size(entry, size);
 }
 
 /*
@@ -1746,7 +1757,7 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold < entry->size)
+		if (oe_size_greater_than(entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1775,6 +1786,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
+	unsigned long a_size = oe_size(a);
+	unsigned long b_size = oe_size(b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1788,9 +1801,9 @@ static int type_size_sort(const void *_a, const void *_b)
 		return -1;
 	if (a->preferred_base < b->preferred_base)
 		return 1;
-	if (a->size > b->size)
+	if (a_size > b_size)
 		return -1;
-	if (a->size < b->size)
+	if (a_size < b_size)
 		return 1;
 	return a < b ? -1 : (a > b);  /* newest first */
 }
@@ -1877,7 +1890,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = trg_entry->size;
+	trg_size = oe_size(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1889,7 +1902,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = src_entry->size;
+	src_size = oe_size(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2009,7 +2022,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += n->entry->size;
+		freed_mem += oe_size(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2459,7 +2472,7 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (entry->size < 50)
+		if (oe_size_less_than(entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
diff --git a/pack-objects.h b/pack-objects.h
index c12219385a..0beedbc637 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -70,7 +70,9 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;	/* uncompressed size */
+	/* object uncompressed size _if_ size_valid is true */
+	uint32_t size_;
+	unsigned size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
@@ -258,4 +260,58 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
+static inline unsigned long oe_size(const struct object_entry *e)
+{
+	if (e->size_valid) {
+		return e->size_;
+	} else {
+		unsigned long size;
+
+		sha1_object_info(e->idx.oid.hash, &size);
+		return size;
+	}
+}
+
+static inline int oe_fits_in_32bits(unsigned long limit)
+{
+	uint32_t truncated_limit = (uint32_t)limit;
+
+	return limit == truncated_limit;
+}
+
+static inline int oe_size_less_than(const struct object_entry *e,
+				    unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ < limit;
+	if (oe_fits_in_32bits(limit)) /* limit < 2^32 <= size ? */
+		return 0;
+	return oe_size(e) < limit;
+}
+
+static inline int oe_size_greater_than(const struct object_entry *e,
+				       unsigned long limit)
+{
+	if (e->size_valid)
+		return e->size_ > limit;
+	if (oe_fits_in_32bits(limit)) /* limit < 2^32 <= size ? */
+		return 1;
+	return oe_size(e) > limit;
+}
+
+static inline void oe_set_size(struct object_entry *e,
+			       unsigned long size)
+{
+	e->size_ = size;
+	e->size_valid = e->size_ == size;
+
+	if (!e->size_valid) {
+		unsigned long real_size;
+
+		if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
+		    size != real_size)
+			die("BUG: 'size' is supposed to be the object size!");
+	}
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 10/11] pack-objects: shrink delta_size field in struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (8 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:25             ` [PATCH v6 11/11] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
                               ` (3 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Allowing a delta size of 64 bits is crazy. Shrink this field down to
31 bits with one overflow bit.

If we find an existing delta larger than 2GB, we do not cache
delta_size at all and will get the value from oe_size(), potentially
from disk if it's larger than 4GB.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 24 ++++++++++++++----------
 pack-objects.h         | 23 ++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 89ed4b5125..4406af640f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,10 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -140,7 +142,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != entry->delta_size)
+	if (!delta_buf || delta_size != DELTA_SIZE(entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -291,14 +293,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1509,7 +1511,7 @@ static void check_object(struct object_entry *entry)
 			 */
 			oe_set_type(entry, entry->in_pack_type);
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = oe_size(entry);
+			SET_DELTA_SIZE(entry, oe_size(entry));
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1895,7 +1897,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = trg_entry->delta_size;
+		max_size = DELTA_SIZE(trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -1966,10 +1968,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
+	if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
+		return 0;
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == trg_entry->delta_size &&
+		if (delta_size == DELTA_SIZE(trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -1984,7 +1988,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= trg_entry->delta_size;
+		delta_cache_size -= DELTA_SIZE(trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -1997,7 +2001,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	trg_entry->delta_size = delta_size;
+	SET_DELTA_SIZE(trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2120,11 +2124,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, entry->delta_size);
+			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
 			if (size < (1 << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
-				delta_cache_size -= entry->delta_size;
+				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index 0beedbc637..cbd5cf61ca 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -5,6 +5,7 @@
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
 #define OE_Z_DELTA_BITS		16
+#define OE_DELTA_SIZE_BITS	31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -81,7 +82,8 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
-	unsigned long delta_size;	/* delta data size (uncompressed) */
+	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	uint32_t delta_size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -314,4 +316,23 @@ static inline void oe_set_size(struct object_entry *e,
 	}
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	if (e->delta_size_valid)
+		return e->delta_size_;
+	return oe_size(e);
+}
+
+static inline void oe_set_delta_size(struct packing_data *pack,
+				     struct object_entry *e,
+				     unsigned long size)
+{
+	e->delta_size_ = size;
+	e->delta_size_valid = e->delta_size_ == size;
+	if (!e->delta_size_valid && size != oe_size(e))
+		die("BUG: this can only happen in check_object() "
+		    "where delta size is the same as entry size");
+}
+
 #endif
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v6 11/11] pack-objects: reorder members to shrink struct object_entry
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (9 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:25             ` Nguyễn Thái Ngọc Duy
  2018-03-18 14:51             ` [PATCH v6 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
                               ` (2 subsequent siblings)
  13 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-18 14:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Previous patches leave lots of holes and padding in this struct. This
patch reorders the members and shrinks the struct down to 80 bytes
(from 136 bytes, before any field shrinking is done) with 16 bits to
spare (and a couple more in in_pack_header_size when we really run out
of bits).

This is the last in a series of memory reduction patches (see
"pack-objects: a bit of document about struct object_entry" for the
first one).

Overall they've reduced repack memory size on linux-2.6.git from
3.747G to 3.424G, or by around 320M, a decrease of 8.5%. The runtime
of repack has stayed the same throughout this series. Ævar's testing
on a big monorepo he has access to (bigger than linux-2.6.git) has
shown a 7.9% reduction, so the overall expected improvement should be
somewhere around 8%.

See 87po42cwql.fsf@evledraar.gmail.com on-list
(https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/) for
more detailed numbers and a test script used to produce the numbers
cited above.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index cbd5cf61ca..af40211105 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -71,35 +71,36 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	/* object uncompressed size _if_ size_valid is true */
-	uint32_t size_;
-	unsigned size_valid:1;
-	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
+	uint32_t hash;			/* name hint hash */
+	uint32_t size_;	/* object uncompressed size _if_ size_valid is true */
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	void *delta_data;	/* cached delta (uncompressed) */
 	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	unsigned size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
+	unsigned type_valid:1;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
-	unsigned type_valid:1;
-	uint32_t hash;			/* name hint hash */
-	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
 				    * objects against.
 				    */
 	unsigned no_try_delta:1;
+	unsigned char in_pack_header_size;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
 	unsigned depth:OE_DEPTH_BITS;
+
+	/* size: 80, bit_padding: 16 bits */
 };
 
 struct packing_data {
-- 
2.17.0.rc0.347.gf9cf61673a


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-18 14:25             ` [PATCH v6 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:49               ` Ævar Arnfjörð Bjarmason
  2018-03-19 16:19               ` Junio C Hamano
  2018-03-19 16:43               ` Junio C Hamano
  2 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-18 14:49 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sun, Mar 18 2018, Nguyễn Thái Ngọc Duy jotted:

> It's very very rare that an uncompressedd object is larger than 4GB

So this went from a typo of "uncompressd" in v5 to "uncompressedd",
needs one less "d": "uncompressed".

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (10 preceding siblings ...)
  2018-03-18 14:25             ` [PATCH v6 11/11] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-18 14:51             ` Ævar Arnfjörð Bjarmason
  2018-03-21  8:24             ` Jeff King
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
  13 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-18 14:51 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sun, Mar 18 2018, Nguyễn Thái Ngọc Duy jotted:

> v6 fixes the one optimization that I just couldn't get right, fixes
> two off-by-one error messages and a couple commit message update
> (biggest change is in 11/11 to record some numbers from AEvar)

Thanks, aside from the minor typo I just noted in
https://public-inbox.org/git/878tapcucc.fsf@evledraar.gmail.com/ (which
I trust Junio can fix up) this all looks good to me.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-18 14:25             ` [PATCH v6 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
  2018-03-18 14:49               ` Ævar Arnfjörð Bjarmason
@ 2018-03-19 16:19               ` Junio C Hamano
  2018-03-19 16:23                 ` Duy Nguyen
  2018-03-19 16:43               ` Junio C Hamano
  2 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-19 16:19 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +static inline int oe_fits_in_32bits(unsigned long limit)
> +{
> +	uint32_t truncated_limit = (uint32_t)limit;
> +
> +	return limit == truncated_limit;
> +}

I do not think it is worth a reroll (there only are a few
callsites), but the above has nothing to do with "oe" fitting
anything (it is about "limit").  Do you mind if I did this instead?

	static inline int fits_in_32bits(unsigned long size)

... or other suggestions, perhaps?


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 16:19               ` Junio C Hamano
@ 2018-03-19 16:23                 ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-19 16:23 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Mon, Mar 19, 2018 at 5:19 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> +static inline int oe_fits_in_32bits(unsigned long limit)
>> +{
>> +     uint32_t truncated_limit = (uint32_t)limit;
>> +
>> +     return limit == truncated_limit;
>> +}
>
> I do not think it is worth a reroll (there only are a few
> callsites), but the above has nothing to do with "oe" fitting
> anything (it is about "limit").  Do you mind if I did this instead?
>
>         static inline int fits_in_32bits(unsigned long size)
>
> ... or other suggestions, perhaps?
>

I just tried to not pollute the general namespace too much. That works
for me too.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-18 14:25             ` [PATCH v6 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
  2018-03-18 14:49               ` Ævar Arnfjörð Bjarmason
  2018-03-19 16:19               ` Junio C Hamano
@ 2018-03-19 16:43               ` Junio C Hamano
  2018-03-19 16:54                 ` Duy Nguyen
  2018-03-20 18:17                 ` Duy Nguyen
  2 siblings, 2 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-19 16:43 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, peff

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> +static inline void oe_set_size(struct object_entry *e,
> +			       unsigned long size)
> +{
> +	e->size_ = size;
> +	e->size_valid = e->size_ == size;

A quite similar comment as my earlier one applies here.  I wonder if
this is easier to read?
	
	e->size_valid = fits_in_32bits(size);
	if (e->size_valid)
		e->size_ = size;

Stepping back a bit in a different tangent, 

 - fits_in_32bits() is a good public name if the helper is about
   seeing if the given quantity fits in 32bit uint,

 - but that carves it in stone that our e->size_ *will* be 32bit
   forever, which is not good.

So, it may be a good idea to call it size_cacheable_in_oe(size) or
something to ask "I have this 'size'; is it small enough to fit in
the field in the oe, i.e. allow us to cache it, as opposed to having
to go back to the object every time?"  Of course, this would declare
that the helper can only be used for that particular field, but that
is sort of the point of such a change, to allow us to later define
the e->size_ field to different sizes without affecting other stuff.

> +	if (!e->size_valid) {
> +		unsigned long real_size;
> +
> +		if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
> +		    size != real_size)
> +			die("BUG: 'size' is supposed to be the object size!");
> +	}

If an object that is smaller than 4GB is fed to this function with
an incorrect size, we happily record it in e->size_ and declare it
is valid.  Wouldn't that be equally grave error as we are catching
in this block?

> +}
> +
>  #endif

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 16:43               ` Junio C Hamano
@ 2018-03-19 16:54                 ` Duy Nguyen
  2018-03-19 18:29                   ` Junio C Hamano
  2018-03-20 18:17                 ` Duy Nguyen
  1 sibling, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-19 16:54 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Mon, Mar 19, 2018 at 5:43 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> +static inline void oe_set_size(struct object_entry *e,
>> +                            unsigned long size)
>> +{
>> +     e->size_ = size;
>> +     e->size_valid = e->size_ == size;
>
> A quite similar comment as my earlier one applies here.  I wonder if
> this is easier to read?
>
>         e->size_valid = fits_in_32bits(size);
>         if (e->size_valid)
>                 e->size_ = size;
>
> Stepping back a bit in a different tangent,
>
>  - fits_in_32bits() is a good public name if the helper is about
>    seeing if the given quantity fits in 32bit uint,
>
>  - but that carves it in stone that our e->size_ *will* be 32bit
>    forever, which is not good.
>
> So, it may be a good idea to call it size_cacheable_in_oe(size) or
> something to ask "I have this 'size'; is it small enough to fit in
> the field in the oe, i.e. allow us to cache it, as opposed to having
> to go back to the object every time?"  Of course, this would declare
> that the helper can only be used for that particular field, but that
> is sort of the point of such a change, to allow us to later define
> the e->size_ field to different sizes without affecting other stuff.

This is why I do "size_valid = size_ == size". In my private build, I
reduced size_ to less than 32 bits and change the "fits_in_32bits"
function to do something like

int fits_in_32bits(unsigned long size)
{
struct object_entry e;
e.size_ = size;
return e.size_ == size.
}

which makes sure it always works. This spreads the use of "valid = xx
== yy" to more places though. I think if we just limit the use of
this expression to a couple of access wrappers then it's not so bad.

>> +     if (!e->size_valid) {
>> +             unsigned long real_size;
>> +
>> +             if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
>> +                 size != real_size)
>> +                     die("BUG: 'size' is supposed to be the object size!");
>> +     }
>
> If an object that is smaller than 4GB is fed to this function with
> an incorrect size, we happily record it in e->size_ and declare it
> is valid.  Wouldn't that be equally grave error as we are catching
> in this block?

That adds an extra sha1_object_info() call for every object and it's
expensive (I think it's one of the reasons we cache values in
object_entry in the first place). I think there are also a few
occasions where we reuse even bad in-pack objects (there are even
tests for that) so it's not always safe to die() here.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 2/7] gc: add --keep-base-pack
  2018-03-16 21:05         ` Ævar Arnfjörð Bjarmason
@ 2018-03-19 17:26           ` Duy Nguyen
  2018-03-19 19:04             ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-19 17:26 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Junio C Hamano, Jeff King

On Fri, Mar 16, 2018 at 10:05 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:
>
>> +--keep-base-pack::
>> +     All packs except the base pack and those marked with a `.keep`
>> +     files are consolidated into a single pack. The largest pack is
>> +     considered the base pack.
>> +
>
> I wonder if all of this would be less confusing as:
>
>> +--keep-biggest-pack::
>> +     All packs except the largest pack and those marked with a `.keep`
>> +     files are consolidated into a single pack.
>
> I.e. just skimming these docs I'd expect "base" to somehow be the thing
> that we initially cloned, of course in almost all cases that *is* the
> largest pack, but not necessarily. So rather than communicate that
> expectation let's just say largest/biggest?

Keeping the term base pack allows us to change its definition later
(to something other than "largest"). But to be honest I can't see
what else a base pack (or packs) could be. So unless people object I'm changing
this to --keep-biggest-pack (which could take a value <N> to keep <N>
largest packs, but I will refrain from doing things we don't need
right now).

>
> Maybe I'm the only one who finds this confusing...
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 16:54                 ` Duy Nguyen
@ 2018-03-19 18:29                   ` Junio C Hamano
  2018-03-19 18:45                     ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-19 18:29 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

Duy Nguyen <pclouds@gmail.com> writes:

> This is why I do "size_valid = size_ == size". In my private build, I
> reduced size_ to less than 32 bits and change the "fits_in_32bits"
> function to do something like
>
> int fits_in_32bits(unsigned long size)
> {
> struct object_entry e;
> e.size_ = size;
> return e.size_ == size.
> }
>
> which makes sure it always works. This spreads the use of "valid = xx
> == yy"  in more places though. I think if we just limit the use of
> this expression in a couple access wrappers than it's not so bad.

Yes, but then we should name the helper so that it is clear that it
is not about 32-bit but is about the width of e.size_ field.
>
>>> +     if (!e->size_valid) {
>>> +             unsigned long real_size;
>>> +
>>> +             if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
>>> +                 size != real_size)
>>> +                     die("BUG: 'size' is supposed to be the object size!");
>>> +     }
>>
>> If an object that is smaller than 4GB is fed to this function with
>> an incorrect size, we happily record it in e->size_ and declare it
>> is valid.  Wouldn't that be equally grave error as we are catching
>> in this block?
>
> That adds an extra sha1_object_info() to all objects and it's
> expensive (I think it's one of the reasons we cache values in
> object_entry in the first place). I think there are also a few
> occasions we reuse even bad in-pack objects (there are even tests for
> that) so it's not always safe to die() here.

So what?  My point is that I do not see the point in checking if the
size is correct on only one side (i.e. size is too big to fit in
e->size_) and not the other.  If it is worth checking (perhaps under
"#ifndef NDEBUG" or some other debug option?) then I'd think we
should spend cycles for all objects and check.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 18:29                   ` Junio C Hamano
@ 2018-03-19 18:45                     ` Duy Nguyen
  2018-03-19 20:10                       ` Junio C Hamano
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-19 18:45 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Mon, Mar 19, 2018 at 7:29 PM, Junio C Hamano <gitster@pobox.com> wrote:
>>>> +     if (!e->size_valid) {
>>>> +             unsigned long real_size;
>>>> +
>>>> +             if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
>>>> +                 size != real_size)
>>>> +                     die("BUG: 'size' is supposed to be the object size!");
>>>> +     }
>>>
>>> If an object that is smaller than 4GB is fed to this function with
>>> an incorrect size, we happily record it in e->size_ and declare it
>>> is valid.  Wouldn't that be equally grave error as we are catching
>>> in this block?
>>
>> That adds an extra sha1_object_info() to all objects and it's
>> expensive (I think it's one of the reasons we cache values in
>> object_entry in the first place). I think there are also a few
>> occasions we reuse even bad in-pack objects (there are even tests for
>> that) so it's not always safe to die() here.
>
> So what?  My point is that I do not see the point in checking if the
> size is correct on only one side (i.e. size is too big to fit in
> e->size_) and not the other.  If it is worth checking (perhaps under
> "#ifndef NDEBUG" or some other debug option?) then I'd think we
> should spend cycles for all objects and check.

There is a difference. For sizes smaller than 2^32, whatever you pass
to oe_set_size() will be returned by oe_size(), consistently. It does
not matter if this size is "good" or not. With sizes >= 2^32, we make
the assumption that this size must be the same as the one found in the
object database. If it's different, oe_size() will return something
other than what oe_set_size() was given. This check here is to make
sure we do not accidentally let the caller fall into this trap.

Yes, it may be a good thing to check anyway even for sizes < 2^32. I'm
a bit uncomfortable doing that though. I was trying to exercise this
code the other day by reducing size_ field down to 4 bits, and a
couple tests broke but I still don't understand how. It's probably
just me pushing the limits too hard, not a bug in these changes. But
it does tell me that I don't understand pack-objects enough to assert
that "all calls to oe_set_size() give good size".
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v3 2/7] gc: add --keep-base-pack
  2018-03-19 17:26           ` Duy Nguyen
@ 2018-03-19 19:04             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-19 19:04 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Eric Wong, Git Mailing List, Junio C Hamano, Jeff King


On Mon, Mar 19 2018, Duy Nguyen jotted:

> On Fri, Mar 16, 2018 at 10:05 PM, Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> On Fri, Mar 16 2018, Nguyễn Thái Ngọc Duy jotted:
>>
>>> +--keep-base-pack::
>>> +     All packs except the base pack and those marked with a `.keep`
>>> +     files are consolidated into a single pack. The largest pack is
>>> +     considered the base pack.
>>> +
>>
>> I wonder if all of this would be less confusing as:
>>
>>> +--keep-biggest-pack::
>>> +     All packs except the largest pack and those marked with a `.keep`
>>> +     files are consolidated into a single pack.
>>
>> I.e. just skimming these docs I'd expect "base" to somehow be the thing
>> that we initially cloned, of course in almost all cases that *is* the
>> largest pack, but not necessarily. So rather than communicate that
>> expectation let's just say largest/biggest?
>
> Keeping the term base pack allows us to change its definition later
> (something else other than "largest"). But to be honest I can't see
> what else can a base pack(s) be. So unless people object I'm changing
> this to --keep-biggest-pack (which could take a value <N> to keep <N>
> largest packs, but I will refrain from doing things we don't need
> right now).

Maybe I've just been reading this differently, but to me the "base" pack
means the pack that holds the basis of our history, i.e. the thing we
first cloned. As in the base of the history.

Let's say we have a 100MB pack that we cloned, and someone adds a 200MB
(uncompressible) binary file to the repo, then we'll have a "base" pack
that's smaller than the "largest" pack.

So when I was initially reading this series I kept looking for some
discovery of *that* pack, but of course it turned out that it's just
looking for the largest pack.

I just think it's best to avoid that confusion since we really mean
largest, and maybe in the future it would be legitimate to treat the
"base" pack differently, i.e. as you pull down more updates you're
likely to need to be consulting it less and less as time goes on, so
maybe we should have some mode to explicitly exclude just *that* pack
eventually. I.e. as an optimization to keep the more relevant stuff in
cache.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 18:45                     ` Duy Nguyen
@ 2018-03-19 20:10                       ` Junio C Hamano
  2018-03-20 18:08                         ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-19 20:10 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

Duy Nguyen <pclouds@gmail.com> writes:

> There is a difference. For sizes smaller than 2^32, whatever you
> pass to oe_set_size() will be returned by oe_size(),
> consistently. It does not matter if this size is "good" .... If
> it's different, oe_size() will return something else other than
> oe_set_size() is given.

OK, fair enough.

> ... I was trying to exercise this
> code the other day by reducing size_ field down to 4 bits, and a
> couple tests broke but I still don't understand how.

Off by one?  Two or more copies of the same objects available whose
oe_size() are different?


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: Why does pack-objects use so much memory on incremental packing?
  2018-03-17 22:05 ` Why does pack-objects use so much memory on incremental packing? Ævar Arnfjörð Bjarmason
  2018-03-18  8:37   ` Duy Nguyen
@ 2018-03-20  5:28   ` Jeff King
  1 sibling, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-20  5:28 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Duy Nguyen, git, Jonathan Tan, Jeff Hostetler

On Sat, Mar 17, 2018 at 11:05:59PM +0100, Ævar Arnfjörð Bjarmason wrote:

> Splitting this off into its own thread. Aside from the improvements in
> your repack memory reduction (20180317141033.21545-1-pclouds@gmail.com)
> and gc config (20180316192745.19557-1-pclouds@gmail.com) series's I'm
> wondering why repack takes so much memory to incrementally repack new
> stuff when you leave out the base pack.

I think it's a combination of a few issues:

 1. We do a complete history traversal, and then cull out objects which
    our filters reject (e.g., things in a .keep pack). So you pay for
    all of the "struct object", along with the obj_hash table to look
    them up.

    In my measurements of just "git rev-list --objects --all", that's
    about 25MB for git.git. Plus a few misc things (pending object
    structs for the traversal, etc).

 2. The delta-base cache used for the traversal is a fixed size. So
    that's going to be 96MB regardless of your repo size.

I measured a total heap usage of 130MB for "rev-list --objects --all".
That's not 230, but I'm not sure what you're measuring. If it's RSS,
keep in mind that includes the mmap'd packfiles, too.

Doing a separate "rev-list | pack-objects" should be minorly cheaper
(although it will still have a similar peak cost, since that memory will
just be moved to the rev-list process).

If you _just_ want to pack the loose objects, you could probably do
something like:

  find .git/objects/?? -type f |
  tr -d / |
  git pack-objects .git/objects/pack/pack
  git prune-packed

But you'd get pretty crappy deltas out of that, since the heuristics
rely on knowing the filenames of trees and blobs (which you can only get
by walking the graph).

So you'd do better with something like:

  git rev-list --objects $new_tips --not $old_tips |
  git pack-objects .git/objects/pack/pack

but it's hard to know what "$old_tips" should be, unless you recorded it
last time you did a full repack.

> But no, it takes around 230MB. But thinking about it a bit further:
> 
>  * This builds on top of existing history, so that needs to be
>    read/consulted

Right, I think this is the main thing.

>  * We might be reusing (if not directly, skipping re-computing) deltas
>    from the existing pack.

I don't think that should matter. We'll reuse deltas if the base is
going into our pack, but otherwise recompute. The delta computation
itself takes some memory, but it should be fairly constant even for a
large repo (it's really average_blob_size * window_size).

So I think most of your memory is just going to the traversal stuff.
Running:

  valgrind --tool=massif git pack-objects --all foo </dev/null
  ms_print massif.out.*

shows 223MB at peak, with 43% of the memory to the delta cache, about
10% to traversal (object structs and hash), 6% to pack revindexes, and
34% for the big packlist array. So for anything focusing on the
packlist, you can at best reclaim 75MB.

I suspect a bigger repository would be more interesting, though, since
the delta cache would remain the same size.

> But I get the same result if after cloning I make an orphan branch, and
> pass all the "do this as cheaply as possible" branches I can find down
> to git-repack:
> 
>     (
>         rm -rf /tmp/git &&
>         git clone git@github.com:git/git.git /tmp/git &&
>         cd /tmp/git &&
>         touch $(ls .git/objects/pack/*pack | sed 's/\.pack$/.keep/') &&
>         git checkout --orphan new &&
>         git reset --hard &&
>         for i in {1..10}
>         do
>             touch $i &&
>             git add $i &&
>             git commit -m$i
>         done &&
>         git tag -d $(git tag -l) &&
>         /usr/bin/time -f %M git repack -A -d -f -F --window=1 --depth=1
>     )
> 
> But the memory use barely changes, my first example used 227924 kb, but
> this one uses 226788.

I think you still had to do the whole history traversal there, because
you have existing refs (the "master" branch, along with refs/remotes) as
well as reflogs.

Try:

  git branch -d master
  git remote rm origin
  rm -rf .git/logs

After that, the repack uses about 5MB.

> Jeff: Is this something ref islands[1] could be (ab)used to do, or have
> I misunderstood that concept?
> 
> 1. https://public-inbox.org/git/20130626051117.GB26755@sigill.intra.peff.net/
>    https://public-inbox.org/git/20160304153359.GA16300@sigill.intra.peff.net/
>    https://public-inbox.org/git/20160809174528.2ydgkhd7ayclat3t@sigill.intra.peff.net/

I think you misunderstood the concept. :)

They are about disallowing deltas between unrelated islands. They
actually require _more_ memory, because you have to store an island
bitmap for each object (though with some copy-on-write magic, it's not
too bad). But they can never save you memory, since reused deltas are
always cheaper than re-finding new ones.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 20:10                       ` Junio C Hamano
@ 2018-03-20 18:08                         ` Duy Nguyen
  2018-03-20 18:22                           ` Junio C Hamano
  2018-03-21  8:03                           ` Jeff King
  0 siblings, 2 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-20 18:08 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Mon, Mar 19, 2018 at 01:10:49PM -0700, Junio C Hamano wrote:
> > ... I was trying to exercise this
> > code the other day by reducing size_ field down to 4 bits, and a
> > couple tests broke but I still don't understand how.
> 
> Off by one?  Two or more copies of the same objects available whose
> oe_size() are different?
> 

No. I did indeed not understand pack-objects enough :)

This "size" field contains the delta size if the in-pack object is a
delta. So blindly falling back to object_sha1_info() which returns the
canonical object size is definitely wrong. Please eject the series
from 'pu' until I fix this. The bug won't likely affect anyone (since
they must have 4GB+ objects to trigger it) but better safe than sorry.

BTW can you apply this patch? This broken && chain made me think the
problem was in the next test. It would have saved me lots of time if I
saw this "BUG" line coming from the previous test.

-- 8< --
Subject: [PATCH] t9300: fix broken && chain

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 t/t9300-fast-import.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
index e4d06accc4..e2a0ae4075 100755
--- a/t/t9300-fast-import.sh
+++ b/t/t9300-fast-import.sh
@@ -348,7 +348,7 @@ test_expect_success 'B: accept branch name "TEMP_TAG"' '
 	INPUT_END
 
 	test_when_finished "rm -f .git/TEMP_TAG
-		git gc
+		git gc &&
 		git prune" &&
 	git fast-import <input &&
 	test -f .git/TEMP_TAG &&
@@ -365,7 +365,7 @@ test_expect_success 'B: accept empty committer' '
 	INPUT_END
 
 	test_when_finished "git update-ref -d refs/heads/empty-committer-1
-		git gc
+		git gc &&
 		git prune" &&
 	git fast-import <input &&
 	out=$(git fsck) &&
-- 
2.17.0.rc0.348.gd5a49e0b6f

-- 8< --

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-19 16:43               ` Junio C Hamano
  2018-03-19 16:54                 ` Duy Nguyen
@ 2018-03-20 18:17                 ` Duy Nguyen
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-20 18:17 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

On Mon, Mar 19, 2018 at 5:43 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> +static inline void oe_set_size(struct object_entry *e,
>> +                            unsigned long size)
>> +{
>> +     e->size_ = size;
>> +     e->size_valid = e->size_ == size;
>
> A quite similar comment as my earlier one applies here.  I wonder if
> this is easier to read?
>
>         e->size_valid = fits_in_32bits(size);
>         if (e->size_valid)
>                 e->size_ = size;

I wonder if wrapping this "==" with something like this would help readability?

#define truncated(a,b) (a) != (b)

Then we could write

e->size_valid = !truncated(e->size_, size);
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-20 18:08                         ` Duy Nguyen
@ 2018-03-20 18:22                           ` Junio C Hamano
  2018-03-21  8:03                           ` Jeff King
  1 sibling, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-20 18:22 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Jeff King

Duy Nguyen <pclouds@gmail.com> writes:

> This "size" field contains the delta size if the in-pack object is a
> delta. So blindly falling back to object_sha1_info() which returns the
> canonical object size is definitely wrong.

Yup.  Also we need to be careful when going back to the packfile to
read the size in question.  A different packfile that has the same
object may have delta that was constructed differently and of wrong
size.

> Please eject the series
> from 'pu' until I fix this. The bug won't likely affect anyone (since
> they must have 4GB+ objects to trigger it) but better safe than sorry.

> BTW can you apply this patch? This broken && chain made me think the
> problem was in the next test. It would have saved me lots of time if I
> saw this "BUG" line coming from the previous test.

Thanks, will do.

>
> -- 8< --
> Subject: [PATCH] t9300: fix broken && chain
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  t/t9300-fast-import.sh | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
> index e4d06accc4..e2a0ae4075 100755
> --- a/t/t9300-fast-import.sh
> +++ b/t/t9300-fast-import.sh
> @@ -348,7 +348,7 @@ test_expect_success 'B: accept branch name "TEMP_TAG"' '
>  	INPUT_END
>  
>  	test_when_finished "rm -f .git/TEMP_TAG
> -		git gc
> +		git gc &&
>  		git prune" &&
>  	git fast-import <input &&
>  	test -f .git/TEMP_TAG &&
> @@ -365,7 +365,7 @@ test_expect_success 'B: accept empty committer' '
>  	INPUT_END
>  
>  	test_when_finished "git update-ref -d refs/heads/empty-committer-1
> -		git gc
> +		git gc &&
>  		git prune" &&
>  	git fast-import <input &&
>  	out=$(git fsck) &&

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-20 18:08                         ` Duy Nguyen
  2018-03-20 18:22                           ` Junio C Hamano
@ 2018-03-21  8:03                           ` Jeff King
  2018-03-21 16:12                             ` Duy Nguyen
  1 sibling, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-21  8:03 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Junio C Hamano, Ævar Arnfjörð Bjarmason,
	Eric Wong, Git Mailing List

On Tue, Mar 20, 2018 at 07:08:07PM +0100, Duy Nguyen wrote:

> BTW can you apply this patch? This broken && chain made me think the
> problem was in the next test. It would have saved me lots of time if I
> saw this "BUG" line coming from the previous test.
> 
> -- 8< --
> Subject: [PATCH] t9300: fix broken && chain
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  t/t9300-fast-import.sh | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
> index e4d06accc4..e2a0ae4075 100755
> --- a/t/t9300-fast-import.sh
> +++ b/t/t9300-fast-import.sh
> @@ -348,7 +348,7 @@ test_expect_success 'B: accept branch name "TEMP_TAG"' '
>  	INPUT_END
>  
>  	test_when_finished "rm -f .git/TEMP_TAG
> -		git gc
> +		git gc &&
>  		git prune" &&

The &&-chain is broken from the first command, too. It's "rm -f", which
is not that big a deal, but...

> @@ -365,7 +365,7 @@ test_expect_success 'B: accept empty committer' '
>  	INPUT_END
>  
>  	test_when_finished "git update-ref -d refs/heads/empty-committer-1
> -		git gc
> +		git gc &&
>  		git prune" &&

Same here, but we probably care more about noticing update-ref failure.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (11 preceding siblings ...)
  2018-03-18 14:51             ` [PATCH v6 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
@ 2018-03-21  8:24             ` Jeff King
  2018-03-21 15:59               ` Duy Nguyen
  2018-03-21 16:31               ` Ævar Arnfjörð Bjarmason
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
  13 siblings, 2 replies; 273+ messages in thread
From: Jeff King @ 2018-03-21  8:24 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sun, Mar 18, 2018 at 03:25:15PM +0100, Nguyễn Thái Ngọc Duy wrote:

> v6 fixes the one optimization that I just couldn't get right, fixes
> two off-by-one error messages and a couple of commit message updates
> (biggest change is in 11/11 to record some numbers from AEvar)

I was traveling during some of the earlier rounds, so I finally got a
chance to take a look at this.

I hate to be a wet blanket, but am I the only one who is wondering
whether the tradeoff is worth it? 8% memory reduction doesn't seem
mind-bogglingly good, and I'm concerned about two things:

  1. The resulting code is harder to read and reason about (things like
     the DELTA() macros), and seems a lot more brittle (things like the
     new size_valid checks).

  2. There are lots of new limits. Some of these are probably fine
     (e.g., the cacheable delta size), but things like the
     number-of-packs limit don't have very good user-facing behavior.
     Yes, having that many packs is insane, but that's going to be small
     consolation to somebody whose automated maintenance program now
     craps out at 16k packs, when it previously would have just worked
     to fix the situation.

Saving 8% is nice, but the number of objects in linux.git grew over 12%
in the last year. So you've bought yourself 8 months before the problem
is back. Is it worth making these changes that we'll have to deal with
for many years to buy 8 months of memory savings?

I think ultimately to work on low-memory machines we'll need a
fundamentally different approach that scales with the objects since the
last pack, and not with the complete history.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21  8:24             ` Jeff King
@ 2018-03-21 15:59               ` Duy Nguyen
  2018-03-21 16:17                 ` Ævar Arnfjörð Bjarmason
                                   ` (2 more replies)
  2018-03-21 16:31               ` Ævar Arnfjörð Bjarmason
  1 sibling, 3 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-21 15:59 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Wed, Mar 21, 2018 at 9:24 AM, Jeff King <peff@peff.net> wrote:
> On Sun, Mar 18, 2018 at 03:25:15PM +0100, Nguyễn Thái Ngọc Duy wrote:
>
>> v6 fixes the one optimization that I just couldn't get right, fixes
>> two off-by-one error messages and a couple commit message update
>> (biggest change is in 11/11 to record some numbers from AEvar)
>
> I was traveling during some of the earlier rounds, so I finally got a
> chance to take a look at this.
>
> I hate to be a wet blanket, but am I the only one who is wondering
> whether the tradeoffs is worth it? 8% memory reduction doesn't seem
> mind-bogglingly good,

AEvar measured RSS. If we count objects[] array alone, the saving is
40% (136 bytes per entry down to 80). Some is probably eaten up by
mmap in rss.

> and I'm concerned about two things:
>
>   1. The resulting code is harder to read and reason about (things like
>      the DELTA() macros), and seems a lot more brittle (things like the
>      new size_valid checks).
>
>   2. There are lots of new limits. Some of these are probably fine
>      (e.g., the cacheable delta size), but things like the
>      number-of-packs limit don't have very good user-facing behavior.
>      Yes, having that many packs is insane, but that's going to be small
>      consolation to somebody whose automated maintenance program now
>      craps out at 16k packs, when it previously would have just worked
>      to fix the situation.
>
> Saving 8% is nice, but the number of objects in linux.git grew over 12%
> in the last year. So you've bought yourself 8 months before the problem
> is back. Is it worth making these changes that we'll have to deal with
> for many years to buy 8 months of memory savings?

Well, with 40% it buys us a couple more months. The object growth
affects rev-list --all too so the actual "good months" is probably not
super far from 8 months.

Is it worth saving? I don't know. I raised the readability point from
the very first patch and if people believe it makes it much harder to
read, then no it's not worth it.

While pack-objects is simple from the functionality point of view, it
has received lots of optimizations and to me is quite fragile.
Readability does count in this code. Fortunately it still looks quite
ok to me with this series applied (but then it's subjective)

About the 16k limit (and some other limits as well), I'm making these
patches with the assumption that large scale deployment probably will
go with custom builds anyway. Adjusting the limits back should be
quite easy while we can still provide reasonable defaults for most
people.

> I think ultimately to work on low-memory machines we'll need a
> fundamentally different approach that scales with the objects since the
> last pack, and not with the complete history.

Absolutely. Which is covered in a separate "gc --auto" series. Some
memory reduction here may still be nice to have though. Even on a beefy
machine, memory can still be reused somewhere other than wasted in
unused bits.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 09/11] pack-objects: shrink size field in struct object_entry
  2018-03-21  8:03                           ` Jeff King
@ 2018-03-21 16:12                             ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-21 16:12 UTC (permalink / raw)
  To: Jeff King
  Cc: Junio C Hamano, Ævar Arnfjörð Bjarmason,
	Eric Wong, Git Mailing List

On Wed, Mar 21, 2018 at 9:03 AM, Jeff King <peff@peff.net> wrote:
> On Tue, Mar 20, 2018 at 07:08:07PM +0100, Duy Nguyen wrote:
>
>> BTW can you apply this patch? This broken && chain made me think the
>> problem was in the next test. It would have saved me lots of time if I
>> saw this "BUG" line coming from the previous test.
>>
>> -- 8< --
>> Subject: [PATCH] t9300: fix broken && chain
>>
>> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
>> ---
>>  t/t9300-fast-import.sh | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
>> index e4d06accc4..e2a0ae4075 100755
>> --- a/t/t9300-fast-import.sh
>> +++ b/t/t9300-fast-import.sh
>> @@ -348,7 +348,7 @@ test_expect_success 'B: accept branch name "TEMP_TAG"' '
>>       INPUT_END
>>
>>       test_when_finished "rm -f .git/TEMP_TAG
>> -             git gc
>> +             git gc &&
>>               git prune" &&
>
> The &&-chain is broken from the first command, too. It's "rm -f", which
> is not that big a deal, but...
>
>> @@ -365,7 +365,7 @@ test_expect_success 'B: accept empty committer' '
>>       INPUT_END
>>
>>       test_when_finished "git update-ref -d refs/heads/empty-committer-1
>> -             git gc
>> +             git gc &&
>>               git prune" &&
>
> Same here, but we probably care more about noticing update-ref failure.

Yes. I wasn't sure if that update-ref could fail but did not check
since this was a side issue for me.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 15:59               ` Duy Nguyen
@ 2018-03-21 16:17                 ` Ævar Arnfjörð Bjarmason
  2018-03-21 16:22                   ` Duy Nguyen
  2018-03-21 16:46                 ` Duy Nguyen
  2018-03-22  9:32                 ` Jeff King
  2 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-21 16:17 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Jeff King, Eric Wong, Git Mailing List, Junio C Hamano


On Wed, Mar 21 2018, Duy Nguyen wrote:

> On Wed, Mar 21, 2018 at 9:24 AM, Jeff King <peff@peff.net> wrote:
>> On Sun, Mar 18, 2018 at 03:25:15PM +0100, Nguyễn Thái Ngọc Duy wrote:
>>
>>> v6 fixes the one optimization that I just couldn't get right, fixes
>>> two off-by-one error messages and a couple commit message update
>>> (biggest change is in 11/11 to record some numbers from AEvar)
>>
>> I was traveling during some of the earlier rounds, so I finally got a
>> chance to take a look at this.
>>
>> I hate to be a wet blanket, but am I the only one who is wondering
>> whether the tradeoffs is worth it? 8% memory reduction doesn't seem
>> mind-bogglingly good,
>
> AEvar measured RSS. If we count objects[] array alone, the saving is
> 40% (136 bytes per entry down to 80). Some is probably eaten up by
> mmap in rss.

Yeah, sorry about spreading that confusion.

>> and I'm concerned about two things:
>>
>>   1. The resulting code is harder to read and reason about (things like
>>      the DELTA() macros), and seems a lot more brittle (things like the
>>      new size_valid checks).
>>
>>   2. There are lots of new limits. Some of these are probably fine
>>      (e.g., the cacheable delta size), but things like the
>>      number-of-packs limit don't have very good user-facing behavior.
>>      Yes, having that many packs is insane, but that's going to be small
>>      consolation to somebody whose automated maintenance program now
>>      craps out at 16k packs, when it previously would have just worked
>>      to fix the situation.
>>
>> Saving 8% is nice, but the number of objects in linux.git grew over 12%
>> in the last year. So you've bought yourself 8 months before the problem
>> is back. Is it worth making these changes that we'll have to deal with
>> for many years to buy 8 months of memory savings?
>
> Well, with 40% it buys us a couple more months. The object growth
> affects rev-list --all too so the actual "good months" is probably not
> super far from 8 months.
>
> Is it worth saving? I don't know. I raised the readability point from
> the very first patch and if people believe it makes it much harder to
> read, then no it's not worth it.
>
> While pack-objects is simple from the functionality point of view, it
> has received lots of optimizations and to me is quite fragile.
> Readability does count in this code. Fortunately it still looks quite
> ok to me with this series applied (but then it's subjective)
>
> About the 16k limit (and some other limits as well), I'm making these
> patches with the assumption that large scale deployment probably will
> go with custom builds anyway. Adjusting the limits back should be
> quite easy while we can still provide reasonable defaults for most
> people.
>
>> I think ultimately to work on low-memory machines we'll need a
>> fundamentally different approach that scales with the objects since the
>> last pack, and not with the complete history.
>
> Absolutely. Which is covered in a separate "gc --auto" series. Some
> memory reduction here may be still nice to have though. Even on beefy
> machine, memory can still be reused somewhere other than wasted in
> unused bits.

FWIW I've been running a combination of these two at work (also keeping
the big pack), and they've had a sizable impact on packing our monorepo,
on one of our dev boxes on a real-world checkout with a combo of the
"base" pack and other packs + loose objects, as measured by
/usr/bin/time

 * Reduction in user time by 37%
 * Reduction in system time by 84%
 * Reduction in RSS by 61%
 * Reduction in page faults by 58% & 94% (time(1) reports two different numbers)
 * Reduction in the I of I/O by 58%
 * Reduction in the O of I/O by 94%

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 16:17                 ` Ævar Arnfjörð Bjarmason
@ 2018-03-21 16:22                   ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-21 16:22 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Jeff King, Eric Wong, Git Mailing List, Junio C Hamano

On Wed, Mar 21, 2018 at 5:17 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>>> I think ultimately to work on low-memory machines we'll need a
>>> fundamentally different approach that scales with the objects since the
>>> last pack, and not with the complete history.
>>
>> Absolutely. Which is covered in a separate "gc --auto" series. Some
>> memory reduction here may be still nice to have though. Even on beefy
>> machine, memory can still be reused somewhere other than wasted in
>> unused bits.
>
> FWIW I've been running a combination of these two at work (also keeping
> the big pack), and they've had a sizable impact on packing our monorepo,
> on one of our dev boxes on a real-world checkout with a combo of the
> "base" pack and other packs + loose objects, as measured by
> /usr/bin/time
>
>  * Reduction in user time by 37%
>  * Reduction in system time by 84%
>  * Reduction in RSS by 61%
>  * Reduction in page faults by 58% & 94% (time(1) reports two different numbers)
>  * Reduction in the I of I/O by 58%
>  * Reduction in the O of I/O by 94%

The keeping big pack changes very likely contributes to most of this
reduction, so just to be clear these numbers can't be used as an
argument in favor of this pack-objects series (but otherwise, wow! I
guess I need to finish up the gc series soon, then start the external
rev-list work to reduce even more ;-)
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21  8:24             ` Jeff King
  2018-03-21 15:59               ` Duy Nguyen
@ 2018-03-21 16:31               ` Ævar Arnfjörð Bjarmason
  2018-03-21 16:53                 ` Junio C Hamano
  2018-03-22  8:07                 ` Jeff King
  1 sibling, 2 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-21 16:31 UTC (permalink / raw)
  To: Jeff King; +Cc: Nguyễn Thái Ngọc Duy, e, git, gitster


On Wed, Mar 21 2018, Jeff King wrote:

> On Sun, Mar 18, 2018 at 03:25:15PM +0100, Nguyễn Thái Ngọc Duy wrote:
>
>> v6 fixes the one optimization that I just couldn't get right, fixes
>> two off-by-one error messages and a couple commit message update
>> (biggest change is in 11/11 to record some numbers from AEvar)
>
> [...]Yes, having that many packs is insane, but that's going to be
> small consolation to somebody whose automated maintenance program now
> craps out at 16k packs, when it previously would have just worked to
> fix the situation[...]

That's going to be a super rare (and probably nonexistent) edge case, but
(untested) I wonder if something like this on top would alleviate your
concerns, i.e. instead of dying we just take the first N packs up to our
limit:

    diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
    index 4406af640f..49d467ab2a 100644
    --- a/builtin/pack-objects.c
    +++ b/builtin/pack-objects.c
    @@ -1065,8 +1065,9 @@ static int want_object_in_pack(const struct object_id *oid,

            want = 1;
     done:
    -       if (want && *found_pack && !(*found_pack)->index)
    -               oe_add_pack(&to_pack, *found_pack);
    +       if (want && *found_pack && !(*found_pack)->index) {
    +               if (oe_add_pack(&to_pack, *found_pack) == -1)
    +                       return 0;

            return want;
     }
    diff --git a/pack-objects.h b/pack-objects.h
    index 9f8e450e19..50ed2028fb 100644
    --- a/pack-objects.h
    +++ b/pack-objects.h
    @@ -171,15 +171,17 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
            pack->in_pack_pos[e - pack->objects] = pos;
     }

    -static inline unsigned int oe_add_pack(struct packing_data *pack,
    +static inline int oe_add_pack(struct packing_data *pack,
                                           struct packed_git *p)
     {
    -       if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
    -               die(_("too many packs to handle in one go. "
    -                     "Please add .keep files to exclude\n"
    -                     "some pack files and keep the number "
    -                     "of non-kept files below %d."),
    +       if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS)) {
    +               warning(_("Too many packs to handle in one go. "
    +                         "Ran into the limit of %d.\n"
    +                         "Limping along by pretending packs beyond that"
    +                         "number have *.keep!"),
                        1 << OE_IN_PACK_BITS);
    +               return -1;
    +       }
            if (p) {
                    if (p->index > 0)
                            die("BUG: this packed is already indexed");

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 15:59               ` Duy Nguyen
  2018-03-21 16:17                 ` Ævar Arnfjörð Bjarmason
@ 2018-03-21 16:46                 ` Duy Nguyen
  2018-03-21 19:11                   ` Junio C Hamano
  2018-03-22  9:32                 ` Jeff King
  2 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-21 16:46 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Wed, Mar 21, 2018 at 04:59:19PM +0100, Duy Nguyen wrote:
> About the 16k limit (and some other limits as well), I'm making these
> patches with the assumption that large scale deployment probably will
> go with custom builds anyway. Adjusting the limits back should be
> quite easy while we can still provide reasonable defaults for most
> people.

And we could even do something like this to make custom builds
easier. Some more gluing is needed so you can set this from config.mak
but you get the idea. This removes all limits set by this
series. Readability in pack-objects.c and object_entry struct
declaration is still a concern though.

-- 8< --
diff --git a/pack-objects.h b/pack-objects.h
index af40211105..b6e84c9b48 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,10 +2,17 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#ifdef PACK_OBJECTS_BIG_MEMORY
+#define OE_DEPTH_BITS		31
+/* OE_IN_PACK_BITS is not defined */
+#define OE_Z_DELTA_BITS		32
+#define OE_DELTA_SIZE_BITS	32
+#else
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		14
 #define OE_Z_DELTA_BITS		16
 #define OE_DELTA_SIZE_BITS	31
+#endif
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -82,7 +89,11 @@ struct object_entry {
 				     */
 	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
+#ifdef PACK_OBJECTS_BIG_MEMORY
+	struct packed_git *in_pack; /* already in pack */
+#else
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+#endif
 	unsigned size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
@@ -112,7 +123,9 @@ struct packing_data {
 
 	unsigned int *in_pack_pos;
 	int in_pack_count;
+#ifndef PACK_OBJECTS_BIG_MEMORY
 	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
+#endif
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -174,6 +187,9 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 static inline unsigned int oe_add_pack(struct packing_data *pack,
 				       struct packed_git *p)
 {
+#ifdef PACK_OBJECTS_BIG_MEMORY
+	return 0;
+#else
 	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
 		die(_("too many packs to handle in one go. "
 		      "Please add .keep files to exclude\n"
@@ -187,22 +203,31 @@ static inline unsigned int oe_add_pack(struct packing_data *pack,
 	}
 	pack->in_pack[pack->in_pack_count] = p;
 	return pack->in_pack_count++;
+#endif
 }
 
 static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
 					    const struct object_entry *e)
 {
+#ifdef PACK_OBJECTS_BIG_MEMORY
+	return e->in_pack;
+#else
 	return pack->in_pack[e->in_pack_idx];
+#endif
 
 }
 
 static inline void oe_set_in_pack(struct object_entry *e,
 				  struct packed_git *p)
 {
+#ifdef PACK_OBJECTS_BIG_MEMORY
+	e->in_pack = p;
+#else
 	if (p->index <= 0)
 		die("BUG: found_pack should be NULL "
 		    "instead of having non-positive index");
 	e->in_pack_idx = p->index;
+#endif
 
 }
 
-- 8< --

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 16:31               ` Ævar Arnfjörð Bjarmason
@ 2018-03-21 16:53                 ` Junio C Hamano
  2018-03-21 17:00                   ` Duy Nguyen
  2018-03-22  8:07                 ` Jeff King
  1 sibling, 1 reply; 273+ messages in thread
From: Junio C Hamano @ 2018-03-21 16:53 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Jeff King, Nguyễn Thái Ngọc Duy, e, git

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> That's going to be super rare (and probably nonexisting) edge case, but
> (untested) I wonder if something like this on top would alleviate your
> concerns, i.e. instead of dying we just take the first N packs up to our
> limit:
>
>     diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
>     index 4406af640f..49d467ab2a 100644
>     --- a/builtin/pack-objects.c
>     +++ b/builtin/pack-objects.c
>     @@ -1065,8 +1065,9 @@ static int want_object_in_pack(const struct object_id *oid,
>
>             want = 1;
>      done:
>     -       if (want && *found_pack && !(*found_pack)->index)
>     -               oe_add_pack(&to_pack, *found_pack);
>     +       if (want && *found_pack && !(*found_pack)->index) {
>     +               if (oe_add_pack(&to_pack, *found_pack) == -1)
>     +                       return 0;
>
>             return want;
>      }

It is probably a small first step in the right direction, but we'd
need to communicate which packs we ignored with this logic to the
calling program.  I offhand do not know how we would handle the "-d"
part of "repack -a -d" without it.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 16:53                 ` Junio C Hamano
@ 2018-03-21 17:00                   ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-21 17:00 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Ævar Arnfjörð Bjarmason, Jeff King, Eric Wong,
	Git Mailing List

On Wed, Mar 21, 2018 at 5:53 PM, Junio C Hamano <gitster@pobox.com> wrote:
> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> That's going to be super rare (and probably nonexisting) edge case, but
>> (untested) I wonder if something like this on top would alleviate your
>> concerns, i.e. instead of dying we just take the first N packs up to our
>> limit:
>>
>>     diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
>>     index 4406af640f..49d467ab2a 100644
>>     --- a/builtin/pack-objects.c
>>     +++ b/builtin/pack-objects.c
>>     @@ -1065,8 +1065,9 @@ static int want_object_in_pack(const struct object_id *oid,
>>
>>             want = 1;
>>      done:
>>     -       if (want && *found_pack && !(*found_pack)->index)
>>     -               oe_add_pack(&to_pack, *found_pack);
>>     +       if (want && *found_pack && !(*found_pack)->index) {
>>     +               if (oe_add_pack(&to_pack, *found_pack) == -1)
>>     +                       return 0;
>>
>>             return want;
>>      }
>
> It is probably a small first step in the right direction, but we'd
> need to communicate which packs we ignored with this logic to the
> calling program.  I offhand do not know how we would handle the "-d"
> part of "repack -a -d" without it.

repack will delete all the packs except ones with .keep files and ones
created by pack-objects. So this change alone is not enough. I think I
did mention that we could make this work by making repack run
pack-objects multiple times. But I did not do it because I did not
think it could really happen.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 16:46                 ` Duy Nguyen
@ 2018-03-21 19:11                   ` Junio C Hamano
  0 siblings, 0 replies; 273+ messages in thread
From: Junio C Hamano @ 2018-03-21 19:11 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Jeff King, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List

Duy Nguyen <pclouds@gmail.com> writes:

> And we could even do something like this to make custom builds
> easier. Some more gluing is needed so you can set this from config.mak
> but you get the idea. This removes all limits set by this
> series.

Yes, we _could_, but it would mean we would have many variants of
the codepath that is pretty crucial to the integrity of the data we
keep in the repository, all of which must pretty much be bug-free.

> Readability in pack-objects.c and object_entry struct declaration
> is still a concern though.

Yup, a change like this does not change the readability; personally,
I do not think the original is _too_ bad, though.


^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 16:31               ` Ævar Arnfjörð Bjarmason
  2018-03-21 16:53                 ` Junio C Hamano
@ 2018-03-22  8:07                 ` Jeff King
  2018-03-22  8:23                   ` Duy Nguyen
  1 sibling, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-22  8:07 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Nguyễn Thái Ngọc Duy, e, git, gitster

On Wed, Mar 21, 2018 at 05:31:14PM +0100, Ævar Arnfjörð Bjarmason wrote:

> > [...]Yes, having that many packs is insane, but that's going to be
> > small consolation to somebody whose automated maintenance program now
> > craps out at 16k packs, when it previously would have just worked to
> > fix the situation[...]
> 
> That's going to be super rare (and probably nonexisting) edge case, but
> (untested) I wonder if something like this on top would alleviate your
> concerns, i.e. instead of dying we just take the first N packs up to our
> limit:

I wish you were right about the rarity, but it's unfortunately something
I have seen multiple times in the wild (and why I spent time optimizing
the many-packs case for pack-objects). Unfortunately I don't know how
often it actually comes up, because in theory running "git repack"
cleans it up without further ado. But after these patches, not so much.

I'll admit that my experiences aren't necessarily typical of most git
users. But I wouldn't be surprised if other people hosting their own
repositories run into this, too (e.g., somebody pushing in a loop,
auto-gc disabled or clogged by something silly like the "too many loose
objects" warning).

>     diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
>     index 4406af640f..49d467ab2a 100644
>     --- a/builtin/pack-objects.c
>     +++ b/builtin/pack-objects.c
>     @@ -1065,8 +1065,9 @@ static int want_object_in_pack(const struct object_id *oid,
> 
>             want = 1;
>      done:
>     -       if (want && *found_pack && !(*found_pack)->index)
>     -               oe_add_pack(&to_pack, *found_pack);
>     +       if (want && *found_pack && !(*found_pack)->index) {
>     +               if (oe_add_pack(&to_pack, *found_pack) == -1)
>     +                       return 0;

Something like this does seem like a much better fallback, as we'd make
forward progress instead of aborting (and exacerbating whatever caused
the packs to stack up in the first place).

I think the patch as-is does not work, though. You say "oops, too many
packs" and so the "yes we want this object" return becomes "no, we do
not want it". And it is not included in the resulting packfile.

But what happens after that? After pack-objects finishes, we return to
"git repack", which assumes that pack-objects packed everything it was
told to. And with "-d", it then _deletes_ the old packs, knowing that
anything of value was copied to the new pack. So with this patch, we'd
corrupt the repository if this code is ever hit.

You'd need some way to report back to "git repack" that the pack was
omitted. Or probably more sensibly, you'd need "git repack" to count up
the packs and make sure that it marks anybody beyond the limit manually
as .keep (presumably using Duy's new command-line option rather than
actually writing a file).

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22  8:07                 ` Jeff King
@ 2018-03-22  8:23                   ` Duy Nguyen
  2018-03-22 10:01                     ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-22  8:23 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 9:07 AM, Jeff King <peff@peff.net> wrote:
> On Wed, Mar 21, 2018 at 05:31:14PM +0100, Ævar Arnfjörð Bjarmason wrote:
>
>> > [...]Yes, having that many packs is insane, but that's going to be
>> > small consolation to somebody whose automated maintenance program now
>> > craps out at 16k packs, when it previously would have just worked to
>> > fix the situation[...]
>>
>> That's going to be super rare (and probably nonexisting) edge case, but
>> (untested) I wonder if something like this on top would alleviate your
>> concerns, i.e. instead of dying we just take the first N packs up to our
>> limit:
>
> I wish you were right about the rarity, but it's unfortunately something
> I have seen multiple times in the wild (and why I spent time optimizing
> the many-packs case for pack-objects). Unfortunately I don't know how
> often it actually comes up, because in theory running "git repack"
> cleans it up without further ado. But after these patches, not so much.

It's good to know this case is real and I can start to fix it
(assuming that the other concern about readability will not stop this
series).

I think I'll try to fix this without involving repack. pack-objects
can produce multiple packs, so if we have more than 16k pack files, we
produce one new pack per 16k old ones.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-21 15:59               ` Duy Nguyen
  2018-03-21 16:17                 ` Ævar Arnfjörð Bjarmason
  2018-03-21 16:46                 ` Duy Nguyen
@ 2018-03-22  9:32                 ` Jeff King
  2018-03-22  9:46                   ` Jeff King
                                     ` (2 more replies)
  2 siblings, 3 replies; 273+ messages in thread
From: Jeff King @ 2018-03-22  9:32 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Wed, Mar 21, 2018 at 04:59:19PM +0100, Duy Nguyen wrote:

> > I hate to be a wet blanket, but am I the only one who is wondering
> > whether the tradeoffs is worth it? 8% memory reduction doesn't seem
> > mind-bogglingly good,
> 
> AEvar measured RSS. If we count objects[] array alone, the saving is
> 40% (136 bytes per entry down to 80). Some is probably eaten up by
> mmap in rss.

Measuring actual heap usage with massif, I get before/after peak heaps
of 1728 and 1346MB respectively when repacking linux.git. So that's ~22%
savings overall.

Of the used heap after your patches:

 - ~40% of that is from packlist_alloc()
 - ~17% goes to "struct object"
 - ~10% for the object.c hash table to store all the "struct object"
 - ~7% goes to the delta cache
 - ~7% goes to the pack revindex (actually, there's a duplicate 7%
       there, too; I think our peak is when we're sorting the revindex
       and have to keep two copies in memory at once)
 - ~5% goes to the packlist_find() hash table
 - ~3.5% for the get_object_details() sorting list (this is only held
	 for a minute, but again, our peak comes during this sort, which
	 in turn loads the revindex)

So 27% of the total heap goes away if you switch to a separate rev-list.
Though it's mostly just going to a different process, it does help peak
because that process would have exited by the time we get to the
revindex bits.

I suspect you could get the same effect by just teaching pack-objects to
clear obj_hash and all of the allocated object structs. I think that
should be safe to do as long as we clear _all_ of the objects, so there
are no dangling pointers.

> About the 16k limit (and some other limits as well), I'm making these
> patches with the assumption that large scale deployment probably will
> go with custom builds anyway. Adjusting the limits back should be
> quite easy while we can still provide reasonable defaults for most
> people.

I think this 16k limit is the thing I _most_ dislike about the series.
If we could tweak that case such that we always made forward progress, I
think I'd be a lot less nervous. I responded elsewhere in the thread
(before seeing that both Junio and you seemed aware of the issues ;) ),
but I think it would be acceptable to have git-repack enforce the limit.

That would still mean you could get into a broken state for serving
fetches, but you could at least get out of it by running "git repack".

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22  9:32                 ` Jeff King
@ 2018-03-22  9:46                   ` Jeff King
  2018-03-22 10:57                   ` Duy Nguyen
  2018-03-23  1:28                   ` Ramsay Jones
  2 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-22  9:46 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 05:32:12AM -0400, Jeff King wrote:

> So 27% of the total heap goes away if you switch to a separate rev-list.
> Though it's mostly just going to a different process, it does help peak
> because that process would have exited by the time we get to the
> revindex bits.
> 
> I suspect you could get the same effect by just teaching pack-objects to
> clear obj_hash and all of the allocated object structs. I think that
> should be safe to do as long as we clear _all_ of the objects, so there
> are no dangling pointers.

The patch below tries that. It's kind of hacky, but it drops my peak
heap for packing linux.git from 1336MB to 1129MB.

That's not quite as exciting as 27%, because it just moves our peak
earlier, to when we do have all of the object structs in memory (so the
savings are really just that we're not holding the revindex, etc at the
same time as the object structs).

But we also hold that peak for a lot shorter period, because we drop the
memory before we do any delta compression (which itself can be memory
hungry[1]), and don't hold onto it during the write phase (which can be
network-limited when serving a fetch). So during that write phase we're
holding only ~900MB instead of ~1250MB.

-Peff

[1] All of my timings are on noop repacks of a single pack, so there's
    no actual delta compression. On average, it will use something like
    "nr_threads * window * avg_blob_size". For a "normal" repo, that's
    only a few megabytes. But the peak will depend on the large blobs,
    so it could have some outsize cases. I don't know if it's worth
    worrying about too much for this analysis.

---
Here's the patch. It's probably asking for trouble to have this kind of
clearing interface, as a surprising number of things may hold onto
pointers to objects (see the comment below about the bitmap code).

So maybe the separate process is less insane.

diff --git a/alloc.c b/alloc.c
index 12afadfacd..50d444a3b0 100644
--- a/alloc.c
+++ b/alloc.c
@@ -30,15 +30,23 @@ struct alloc_state {
 	int count; /* total number of nodes allocated */
 	int nr;    /* number of nodes left in current allocation */
 	void *p;   /* first free node in current allocation */
+
+	/* book-keeping for clearing */
+	void *start;
+	struct alloc_state *prev;
 };
 
-static inline void *alloc_node(struct alloc_state *s, size_t node_size)
+static inline void *alloc_node(struct alloc_state **sp, size_t node_size)
 {
+	struct alloc_state *s = *sp;
 	void *ret;
 
-	if (!s->nr) {
+	if (!s || !s->nr) {
+		s = xmalloc(sizeof(*s));
 		s->nr = BLOCKING;
-		s->p = xmalloc(BLOCKING * node_size);
+		s->start = s->p = xmalloc(BLOCKING * node_size);
+		s->prev = *sp;
+		*sp = s;
 	}
 	s->nr--;
 	s->count++;
@@ -48,7 +56,7 @@ static inline void *alloc_node(struct alloc_state *s, size_t node_size)
 	return ret;
 }
 
-static struct alloc_state blob_state;
+static struct alloc_state *blob_state;
 void *alloc_blob_node(void)
 {
 	struct blob *b = alloc_node(&blob_state, sizeof(struct blob));
@@ -56,7 +64,7 @@ void *alloc_blob_node(void)
 	return b;
 }
 
-static struct alloc_state tree_state;
+static struct alloc_state *tree_state;
 void *alloc_tree_node(void)
 {
 	struct tree *t = alloc_node(&tree_state, sizeof(struct tree));
@@ -64,7 +72,7 @@ void *alloc_tree_node(void)
 	return t;
 }
 
-static struct alloc_state tag_state;
+static struct alloc_state *tag_state;
 void *alloc_tag_node(void)
 {
 	struct tag *t = alloc_node(&tag_state, sizeof(struct tag));
@@ -72,7 +80,7 @@ void *alloc_tag_node(void)
 	return t;
 }
 
-static struct alloc_state object_state;
+static struct alloc_state *object_state;
 void *alloc_object_node(void)
 {
 	struct object *obj = alloc_node(&object_state, sizeof(union any_object));
@@ -80,7 +88,7 @@ void *alloc_object_node(void)
 	return obj;
 }
 
-static struct alloc_state commit_state;
+static struct alloc_state *commit_state;
 
 unsigned int alloc_commit_index(void)
 {
@@ -103,7 +111,7 @@ static void report(const char *name, unsigned int count, size_t size)
 }
 
 #define REPORT(name, type)	\
-    report(#name, name##_state.count, name##_state.count * sizeof(type) >> 10)
+    report(#name, name##_state->count, name##_state->count * sizeof(type) >> 10)
 
 void alloc_report(void)
 {
@@ -113,3 +121,22 @@ void alloc_report(void)
 	REPORT(tag, struct tag);
 	REPORT(object, union any_object);
 }
+
+static void alloc_clear(struct alloc_state **sp)
+{
+	while (*sp) {
+		struct alloc_state *s = *sp;
+		*sp = s->prev;
+		free(s->start);
+		free(s);
+	}
+}
+
+void alloc_clear_all(void)
+{
+	alloc_clear(&blob_state);
+	alloc_clear(&tree_state);
+	alloc_clear(&commit_state);
+	alloc_clear(&tag_state);
+	alloc_clear(&object_state);
+}
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4406af640f..7ba8ab07a3 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2959,6 +2959,13 @@ static void get_object_list(int ac, const char **av)
 				     record_recent_object, NULL);
 	}
 
+	/*
+	 * The bitmap code actually stores the commit pointers for future
+	 * reference, so we can't use this memory optimization there.
+	 */
+	if (!write_bitmap_index)
+		free_all_objects();
+
 	if (keep_unreachable)
 		add_objects_in_unpacked_packs(&revs);
 	if (pack_loose_unreachable)
diff --git a/cache.h b/cache.h
index b90feb3802..605bab31de 100644
--- a/cache.h
+++ b/cache.h
@@ -1872,6 +1872,8 @@ extern void *alloc_object_node(void);
 extern void alloc_report(void);
 extern unsigned int alloc_commit_index(void);
 
+void alloc_clear_all(void);
+
 /* pkt-line.c */
 void packet_trace_identity(const char *prog);
 
diff --git a/object.c b/object.c
index 9e6f9ff20b..6530d6fbde 100644
--- a/object.c
+++ b/object.c
@@ -445,3 +445,12 @@ void clear_commit_marks_all(unsigned int flags)
 			obj->flags &= ~flags;
 	}
 }
+
+void free_all_objects(void)
+{
+	alloc_clear_all();
+	free(obj_hash);
+	obj_hash = NULL;
+	obj_hash_size = 0;
+	nr_objs = 0;
+}
diff --git a/object.h b/object.h
index 8ce294d6ec..3eb85215c2 100644
--- a/object.h
+++ b/object.h
@@ -153,4 +153,6 @@ void clear_object_flags(unsigned flags);
  */
 extern void clear_commit_marks_all(unsigned int flags);
 
+void free_all_objects(void);
+
 #endif /* OBJECT_H */

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22  8:23                   ` Duy Nguyen
@ 2018-03-22 10:01                     ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-22 10:01 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 09:23:42AM +0100, Duy Nguyen wrote:

> > I wish you were right about the rarity, but it's unfortunately something
> > I have seen multiple times in the wild (and why I spent time optimizing
> > the many-packs case for pack-objects). Unfortunately I don't know how
> > often it actually comes up, because in theory running "git repack"
> > cleans it up without further ado. But after these patches, not so much.
> 
> It's good to know this case is real and I can start to fix it
> (assuming that the other concern about readability will not stop this
> series).
> 
> I think I'll try to fix this without involving repack. pack-objects
> can produce multiple packs, so if we have more than 16k pack files, we
> produce one new pack per 16k old ones.

I suspect that's going to be hard given the structure of the code.

Could we perhaps just bump to an auxiliary storage in that case?  I.e.,
allocate the final index number to mean "look in this other table", and
then have another table of uint32 indices?

That would mean we can behave as we did previously, but just use a
little more memory in the uncommon >16k case.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22  9:32                 ` Jeff King
  2018-03-22  9:46                   ` Jeff King
@ 2018-03-22 10:57                   ` Duy Nguyen
  2018-03-22 11:52                     ` Jeff King
  2018-03-23  1:28                   ` Ramsay Jones
  2 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-22 10:57 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 10:32 AM, Jeff King <peff@peff.net> wrote:
> That would still mean you could get into a broken state for serving
> fetches, but you could at least get out of it by running "git repack".

I was puzzled by this "broken state" statement. But you were right! I
focused on the repack case and forgot about fetch/clone case. I will
probably just drop this patch for now. Then maybe revisit this some
time in the future when I find out how to deal with this nicely.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22 10:57                   ` Duy Nguyen
@ 2018-03-22 11:52                     ` Jeff King
  2018-03-22 17:04                       ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-22 11:52 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 11:57:27AM +0100, Duy Nguyen wrote:

> On Thu, Mar 22, 2018 at 10:32 AM, Jeff King <peff@peff.net> wrote:
> > That would still mean you could get into a broken state for serving
> > fetches, but you could at least get out of it by running "git repack".
> 
> I was puzzled by this "broken state" statement. But you were right! I
> focused on the repack case and forgot about fetch/clone case. I will
> probably just drop this patch for now. Then maybe revisit this some
> > time in the future when I find out how to deal with this nicely.

Here's a sketch of the "separate array" concept I mentioned before, in
case that helps. Not tested at all beyond compiling.

---
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4406af640f..e4e308b453 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1090,7 +1090,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		oe_set_in_pack(entry, found_pack);
+		oe_set_in_pack(&to_pack, entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
diff --git a/pack-objects.h b/pack-objects.h
index 9f8e450e19..b94b9232fa 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -7,6 +7,8 @@
 #define OE_Z_DELTA_BITS		16
 #define OE_DELTA_SIZE_BITS	31
 
+#define OE_IN_PACK_EXTENDED ((1 << OE_IN_PACK_BITS) - 1)
+
 /*
  * State flags for depth-first search used for analyzing delta cycles.
  *
@@ -111,8 +113,13 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
-	int in_pack_count;
-	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
+
+	struct packed_git **in_pack;
+	uint32_t in_pack_count;
+	size_t in_pack_alloc;
+
+	uint32_t *in_pack_extended;
+	size_t in_pack_extended_alloc;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -174,17 +181,13 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 static inline unsigned int oe_add_pack(struct packing_data *pack,
 				       struct packed_git *p)
 {
-	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
-		die(_("too many packs to handle in one go. "
-		      "Please add .keep files to exclude\n"
-		      "some pack files and keep the number "
-		      "of non-kept files below %d."),
-		    1 << OE_IN_PACK_BITS);
 	if (p) {
 		if (p->index > 0)
 			die("BUG: this packed is already indexed");
 		p->index = pack->in_pack_count;
 	}
+	ALLOC_GROW(pack->in_pack, pack->in_pack_count + 1,
+		   pack->in_pack_alloc);
 	pack->in_pack[pack->in_pack_count] = p;
 	return pack->in_pack_count++;
 }
@@ -192,18 +195,28 @@ static inline unsigned int oe_add_pack(struct packing_data *pack,
 static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
 					    const struct object_entry *e)
 {
-	return pack->in_pack[e->in_pack_idx];
-
+	uint32_t idx = e->in_pack_idx;
+	if (idx == OE_IN_PACK_EXTENDED)
+		idx = pack->in_pack_extended[e - pack->objects];
+	return pack->in_pack[idx];
 }
 
-static inline void oe_set_in_pack(struct object_entry *e,
+static inline void oe_set_in_pack(struct packing_data *pack,
+				  struct object_entry *e,
 				  struct packed_git *p)
 {
 	if (p->index <= 0)
 		die("BUG: found_pack should be NULL "
 		    "instead of having non-positive index");
-	e->in_pack_idx = p->index;
-
+	else if (p->index < OE_IN_PACK_EXTENDED)
+		e->in_pack_idx = p->index;
+	else {
+		size_t index = e - pack->objects;
+		ALLOC_GROW(pack->in_pack_extended,
+			   index, pack->in_pack_extended_alloc);
+		pack->in_pack_extended[index] = p->index;
+		e->in_pack_idx = OE_IN_PACK_EXTENDED;
+	}
 }
 
 static inline struct object_entry *oe_delta(

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22 11:52                     ` Jeff King
@ 2018-03-22 17:04                       ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-22 17:04 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 12:52 PM, Jeff King <peff@peff.net> wrote:
> On Thu, Mar 22, 2018 at 11:57:27AM +0100, Duy Nguyen wrote:
>
>> On Thu, Mar 22, 2018 at 10:32 AM, Jeff King <peff@peff.net> wrote:
>> > That would still mean you could get into a broken state for serving
>> > fetches, but you could at least get out of it by running "git repack".
>>
>> I was puzzled by this "broken state" statement. But you were right! I
>> focused on the repack case and forgot about fetch/clone case. I will
>> probably just drop this patch for now. Then maybe revisit this some
>> time in the future when I find out how to deal with this nicely.
>
> Here's a sketch of the "separate array" concept I mentioned before, in
> case that helps. Not tested at all beyond compiling.

Brilliant! Sorry I couldn't read your suggestion carefully this morning.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-22  9:32                 ` Jeff King
  2018-03-22  9:46                   ` Jeff King
  2018-03-22 10:57                   ` Duy Nguyen
@ 2018-03-23  1:28                   ` Ramsay Jones
  2018-03-23  2:46                     ` Jeff King
  2 siblings, 1 reply; 273+ messages in thread
From: Ramsay Jones @ 2018-03-23  1:28 UTC (permalink / raw)
  To: Jeff King, Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano



On 22/03/18 09:32, Jeff King wrote:
> On Wed, Mar 21, 2018 at 04:59:19PM +0100, Duy Nguyen wrote:
> 
>>> I hate to be a wet blanket, but am I the only one who is wondering
>>> whether the tradeoffs is worth it? 8% memory reduction doesn't seem
>>> mind-bogglingly good,
>>
>> AEvar measured RSS. If we count objects[] array alone, the saving is
>> 40% (136 bytes per entry down to 80). Some is probably eaten up by
>> mmap in rss.
> 
> Measuring actual heap usage with massif, I get before/after peak heaps
> of 1728 and 1346MB respectively when repacking linux.git. So that's ~22%
> savings overall.
> 
> Of the used heap after your patches:
> 
>  - ~40% of that is from packlist_alloc()
>  - ~17% goes to "struct object"
>  - ~10% for the object.c hash table to store all the "struct object"
>  - ~7% goes to the delta cache
>  - ~7% goes to the pack revindex (actually, there's a duplicate 7%
>        there, too; I think our peak is when we're sorting the revindex
>        and have to keep two copies in memory at once)

which begs the question, how much slower would it be if we
replaced the radix-sort with an in-place sort (e.g. heapsort).

I hacked up the patch below, just for fun. I don't have any
large repos (or enough disk space) to do any meaningful perf
tests, but I did at least compile it and it passes the test-suite.
(That is no guarantee that I haven't introduced bugs, of course!)

;-)

ATB,
Ramsay Jones

-- >8 --
Subject: [PATCH] pack-revindex: replace radix-sort with in-place heapsort

Signed-off-by: Ramsay Jones <ramsay@ramsayjones.plus.com>
---
 pack-revindex.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/pack-revindex.c b/pack-revindex.c
index ff5f62c03..16f17eac1 100644
--- a/pack-revindex.c
+++ b/pack-revindex.c
@@ -15,6 +15,7 @@
  * get the object sha1 from the main index.
  */
 
+#ifdef DUMMY
 /*
  * This is a least-significant-digit radix sort.
  *
@@ -112,6 +113,65 @@ static void sort_revindex(struct revindex_entry *entries, unsigned n, off_t max)
 #undef BUCKETS
 #undef DIGIT_SIZE
 }
+#endif
+
+static inline void swap(struct revindex_entry *a, int i, int j)
+{
+	struct revindex_entry t;
+
+	t = a[i];
+	a[i] = a[j];
+	a[j] = t;
+}
+
+/*
+ * assume that elements first .. last (array index first-1 .. last-1) obey
+ * the partially ordered tree property, except possibly for the children of
+ * the first element. push down the first element until the partially
+ * ordered tree property is restored.
+ */
+static void push_down(struct revindex_entry *a, int first, int last)
+{
+	int parent = first;
+	int last_node = last / 2;
+
+	while (parent <= last_node) {
+
+		int left = 2 * parent;
+		int right = left + 1;
+		int biggest;
+
+		if (right > last) /* parent only has one child */
+			biggest = left;
+		else {
+			if (a[left-1].offset >= a[right-1].offset)
+				biggest = left;
+			else
+				biggest = right;
+
+		}
+
+		if (a[parent-1].offset >= a[biggest-1].offset)
+			break; /* partially ordered tree property, we're done */
+
+		/* push parent down */
+		swap(a, parent-1, biggest-1);
+		parent = biggest;
+	}
+}
+
+static void sort_revindex(struct revindex_entry *entries, unsigned n, off_t max)
+{
+	int i;
+
+	for (i = n/2; i > 0; i--)
+		push_down(entries, i, n);
+
+	for (i = n; i > 1; i--) {
+		swap(entries, 0, i-1);
+		push_down(entries, 1, i-1);
+	}
+}
 
 /*
  * Ordered list of offsets of objects in the pack.
-- 
2.16.0


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-23  1:28                   ` Ramsay Jones
@ 2018-03-23  2:46                     ` Jeff King
  2018-03-23  5:50                       ` Jeff King
                                         ` (2 more replies)
  0 siblings, 3 replies; 273+ messages in thread
From: Jeff King @ 2018-03-23  2:46 UTC (permalink / raw)
  To: Ramsay Jones
  Cc: Duy Nguyen, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 23, 2018 at 01:28:12AM +0000, Ramsay Jones wrote:

> > Of the used heap after your patches:
> > 
> >  - ~40% of that is from packlist_alloc()
> >  - ~17% goes to "struct object"
> >  - ~10% for the object.c hash table to store all the "struct object"
> >  - ~7% goes to the delta cache
> >  - ~7% goes to the pack revindex (actually, there's a duplicate 7%
> >        there, too; I think our peak is when we're sorting the revindex
> >        and have to keep two copies in memory at once)
> 
> which begs the question, how much slower would it be if we
> replaced the radix-sort with an in-place sort (e.g. heapsort).
> 
> I hacked up the patch below, just for fun. I don't have any
> large repos (or enough disk space) to do any meaningful perf
> tests, but I did at least compile it and it passes the test-suite.
> (That is no guarantee that I haven't introduced bugs, of course!)

It might have been easier to just revert 8b8dfd5132 (pack-revindex:
radix-sort the revindex, 2013-07-11). It even includes some performance
numbers. :)

In short, no, I don't think we want to go back to a comparison-sort. The
radix sort back then was around 4 times faster for linux.git. And that
was when there were half as many objects in the repository, so the radix
sort should continue to improve as the repo size grows.

The absolute time savings aren't huge for something as bulky as a
repack, so it's less exciting in this context. But it's also not that
much memory (7% of the peak here, but as I showed elsewhere, if we can
stop holding all of the "struct object" memory once we're done with it,
then this revindex stuff doesn't even factor into the peak).

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-23  2:46                     ` Jeff King
@ 2018-03-23  5:50                       ` Jeff King
  2018-03-23 16:01                         ` Ramsay Jones
  2018-03-23  7:05                       ` Duy Nguyen
  2018-03-23 14:03                       ` Ramsay Jones
  2 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-23  5:50 UTC (permalink / raw)
  To: Ramsay Jones
  Cc: Duy Nguyen, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Thu, Mar 22, 2018 at 10:46:09PM -0400, Jeff King wrote:

> > which begs the question, how much slower would it be if we
> > replaced the radix-sort with an in-place sort (e.g. heapsort).
> > 
> > I hacked up the patch below, just for fun. I don't have any
> > large repos (or enough disk space) to do any meaningful perf
> > tests, but I did at least compile it and it passes the test-suite.
> > (That is no guarantee that I haven't introduced bugs, of course!)
> 
> It might have been easier to just revert 8b8dfd5132 (pack-revindex:
> radix-sort the revindex, 2013-07-11). It even includes some performance
> numbers. :)
> 
> In short, no, I don't think we want to go back to a comparison-sort. The
> radix sort back then was around 4 times faster for linux.git. And that
> was when there were half as many objects in the repository, so the radix
> sort should continue to improve as the repo size grows.

I was curious whether my hand-waving there was true. It turns out that
it is: the radix sort has stayed about the same speed but the comparison
sort has gotten even slower. Here are best-of-five timings for "git
cat-file --batch-check='%(objectsize:disk)'", which does very little
besides generate the rev-index:

  [current master, using radix sort]
  real	0m0.104s
  user	0m0.088s
  sys	0m0.016s

  [reverting 8b8dfd5132, going back to qsort]
  real	0m1.193s
  user	0m1.176s
  sys	0m0.016s

So it's now a factor of 11. Yikes.

That number does match some napkin math. The radix sort uses four 16-bit
buckets, but can quit after two rounds (because none of the offsets
is beyond 2^32). So it's essentially O(2n). Whereas the comparison sort
is O(n log n), and with n around 6M, that puts log(n) right around 22.

It's possible that some other comparison-based sort might be a little
more efficient than qsort, but I don't think you'll be able to beat the
algorithmic speedup.

The revert of 8b8dfd5132 is below for reference (it needed a few
conflict tweaks).

-Peff

---
diff --git a/pack-revindex.c b/pack-revindex.c
index ff5f62c033..c20aa9541b 100644
--- a/pack-revindex.c
+++ b/pack-revindex.c
@@ -15,102 +15,11 @@
  * get the object sha1 from the main index.
  */
 
-/*
- * This is a least-significant-digit radix sort.
- *
- * It sorts each of the "n" items in "entries" by its offset field. The "max"
- * parameter must be at least as large as the largest offset in the array,
- * and lets us quit the sort early.
- */
-static void sort_revindex(struct revindex_entry *entries, unsigned n, off_t max)
+static int cmp_offset(const void *a_, const void *b_)
 {
-	/*
-	 * We use a "digit" size of 16 bits. That keeps our memory
-	 * usage reasonable, and we can generally (for a 4G or smaller
-	 * packfile) quit after two rounds of radix-sorting.
-	 */
-#define DIGIT_SIZE (16)
-#define BUCKETS (1 << DIGIT_SIZE)
-	/*
-	 * We want to know the bucket that a[i] will go into when we are using
-	 * the digit that is N bits from the (least significant) end.
-	 */
-#define BUCKET_FOR(a, i, bits) (((a)[(i)].offset >> (bits)) & (BUCKETS-1))
-
-	/*
-	 * We need O(n) temporary storage. Rather than do an extra copy of the
-	 * partial results into "entries", we sort back and forth between the
-	 * real array and temporary storage. In each iteration of the loop, we
-	 * keep track of them with alias pointers, always sorting from "from"
-	 * to "to".
-	 */
-	struct revindex_entry *tmp, *from, *to;
-	int bits;
-	unsigned *pos;
-
-	ALLOC_ARRAY(pos, BUCKETS);
-	ALLOC_ARRAY(tmp, n);
-	from = entries;
-	to = tmp;
-
-	/*
-	 * If (max >> bits) is zero, then we know that the radix digit we are
-	 * on (and any higher) will be zero for all entries, and our loop will
-	 * be a no-op, as everybody lands in the same zero-th bucket.
-	 */
-	for (bits = 0; max >> bits; bits += DIGIT_SIZE) {
-		unsigned i;
-
-		memset(pos, 0, BUCKETS * sizeof(*pos));
-
-		/*
-		 * We want pos[i] to store the index of the last element that
-		 * will go in bucket "i" (actually one past the last element).
-		 * To do this, we first count the items that will go in each
-		 * bucket, which gives us a relative offset from the last
-		 * bucket. We can then cumulatively add the index from the
-		 * previous bucket to get the true index.
-		 */
-		for (i = 0; i < n; i++)
-			pos[BUCKET_FOR(from, i, bits)]++;
-		for (i = 1; i < BUCKETS; i++)
-			pos[i] += pos[i-1];
-
-		/*
-		 * Now we can drop the elements into their correct buckets (in
-		 * our temporary array).  We iterate the pos counter backwards
-		 * to avoid using an extra index to count up. And since we are
-		 * going backwards there, we must also go backwards through the
-		 * array itself, to keep the sort stable.
-		 *
-		 * Note that we use an unsigned iterator to make sure we can
-		 * handle 2^32-1 objects, even on a 32-bit system. But this
-		 * means we cannot use the more obvious "i >= 0" loop condition
-		 * for counting backwards, and must instead check for
-		 * wrap-around with UINT_MAX.
-		 */
-		for (i = n - 1; i != UINT_MAX; i--)
-			to[--pos[BUCKET_FOR(from, i, bits)]] = from[i];
-
-		/*
-		 * Now "to" contains the most sorted list, so we swap "from" and
-		 * "to" for the next iteration.
-		 */
-		SWAP(from, to);
-	}
-
-	/*
-	 * If we ended with our data in the original array, great. If not,
-	 * we have to move it back from the temporary storage.
-	 */
-	if (from != entries)
-		COPY_ARRAY(entries, tmp, n);
-	free(tmp);
-	free(pos);
-
-#undef BUCKET_FOR
-#undef BUCKETS
-#undef DIGIT_SIZE
+	const struct revindex_entry *a = a_;
+	const struct revindex_entry *b = b_;
+	return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0;
 }
 
 /*
@@ -152,7 +61,7 @@ static void create_pack_revindex(struct packed_git *p)
 	 */
 	p->revindex[num_ent].offset = p->pack_size - 20;
 	p->revindex[num_ent].nr = -1;
-	sort_revindex(p->revindex, num_ent, p->pack_size);
+	qsort(p->revindex, num_ent, sizeof(*p->revindex), cmp_offset);
 }
 
 void load_pack_revindex(struct packed_git *p)

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-23  2:46                     ` Jeff King
  2018-03-23  5:50                       ` Jeff King
@ 2018-03-23  7:05                       ` Duy Nguyen
  2018-03-23 14:03                       ` Ramsay Jones
  2 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-23  7:05 UTC (permalink / raw)
  To: Jeff King
  Cc: Ramsay Jones, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 23, 2018 at 3:46 AM, Jeff King <peff@peff.net> wrote:
> On Fri, Mar 23, 2018 at 01:28:12AM +0000, Ramsay Jones wrote:
>
>> > Of the used heap after your patches:
>> >
>> >  - ~40% of that is from packlist_alloc()
>> >  - ~17% goes to "struct object"
>> >  - ~10% for the object.c hash table to store all the "struct object"
>> >  - ~7% goes to the delta cache
>> >  - ~7% goes to the pack revindex (actually, there's a duplicate 7%
>> >        there, too; I think our peak is when we're sorting the revindex
>> >        and have to keep two copies in memory at once)
>>
>> which begs the question, how much slower would it be if we
>> replaced the radix-sort with an in-place sort (e.g. heapsort).
>>
>> I hacked up the patch below, just for fun. I don't have any
>> large repos (or enough disk space) to do any meaningful perf
>> tests, but I did at least compile it and it passes the test-suite.
>> (That is no guarantee that I haven't introduced bugs, of course!)
>
> It might have been easier to just revert 8b8dfd5132 (pack-revindex:
> radix-sort the revindex, 2013-07-11). It even includes some performance
> numbers. :)
>
> In short, no, I don't think we want to go back to a comparison-sort. The
> radix sort back then was around 4 times faster for linux.git. And that
> was when there were half as many objects in the repository, so the radix
> sort should continue to improve as the repo size grows.
>
> The absolute time savings aren't huge for something as bulky as a
> repack, so it's less exciting in this context. But it's also not that
> much memory (7% of the peak here, but as I showed elsewhere, if we can
> stop holding all of the "struct object" memory once we're done with it,
> then this revindex stuff doesn't even factor into the peak).

We probably could do something about revindex after rev-list memory is
freed up too. revindex is used in two places (i'm ignoring
packfile.c): when we look for base objects in ofs-delta in
check_objects() and when we write a reused object. The first case
can't be helped, it's why we need revindex. The second case though is
just to get the compressed object size so we can copy the object.

If we cache the compressed size (like with uint32_t) then we can free
up revindex after check_objects() is done. Even if we repack
everything, this is still a very good saving (32 bits vs 128 bits per
object). The freed up memory could probably be used for delta cache.
But then if we hit a compressed object size larger than 4GB, revindex
must be recreated, but we could detect this in the check_objects
phase and keep revindex alive.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-23  2:46                     ` Jeff King
  2018-03-23  5:50                       ` Jeff King
  2018-03-23  7:05                       ` Duy Nguyen
@ 2018-03-23 14:03                       ` Ramsay Jones
  2 siblings, 0 replies; 273+ messages in thread
From: Ramsay Jones @ 2018-03-23 14:03 UTC (permalink / raw)
  To: Jeff King
  Cc: Duy Nguyen, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano



On 23/03/18 02:46, Jeff King wrote:
> On Fri, Mar 23, 2018 at 01:28:12AM +0000, Ramsay Jones wrote:
> 
>>> Of the used heap after your patches:
>>>
>>>  - ~40% of that is from packlist_alloc()
>>>  - ~17% goes to "struct object"
>>>  - ~10% for the object.c hash table to store all the "struct object"
>>>  - ~7% goes to the delta cache
>>>  - ~7% goes to the pack revindex (actually, there's a duplicate 7%
>>>        there, too; I think our peak is when we're sorting the revindex
>>>        and have to keep two copies in memory at once)
>>
>> which begs the question, how much slower would it be if we
>> replaced the radix-sort with an in-place sort (e.g. heapsort).
>>
>> I hacked up the patch below, just for fun. I don't have any
>> large repos (or enough disk space) to do any meaningful perf
>> tests, but I did at least compile it and it passes the test-suite.
>> (That is no guarantee that I haven't introduced bugs, of course!)
> 
> It might have been easier to just revert 8b8dfd5132 (pack-revindex:
> radix-sort the revindex, 2013-07-11). It even includes some performance
> numbers. :)

But not as much fun! :)

> In short, no, I don't think we want to go back to a comparison-sort. The
> radix sort back then was around 4 times faster for linux.git. And that
> was when there were half as many objects in the repository, so the radix
> sort should continue to improve as the repo size grows.

Yes, I expected radix-sort to be faster.

> The absolute time savings aren't huge for something as bulky as a
> repack, so it's less exciting in this context. But it's also not that
> much memory (7% of the peak here, but as I showed elsewhere, if we can
> stop holding all of the "struct object" memory once we're done with it,
> then this revindex stuff doesn't even factor into the peak).

I didn't see this post until afterwards. So, if it isn't even a
factor in the peak memory use, then it's clear this specific
space/time trade-off also isn't an issue! ;-)

Thanks.

ATB,
Ramsay Jones

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-23  5:50                       ` Jeff King
@ 2018-03-23 16:01                         ` Ramsay Jones
  2018-03-24  6:40                           ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Ramsay Jones @ 2018-03-23 16:01 UTC (permalink / raw)
  To: Jeff King
  Cc: Duy Nguyen, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano



On 23/03/18 05:50, Jeff King wrote:
> On Thu, Mar 22, 2018 at 10:46:09PM -0400, Jeff King wrote:
[snip]
> I was curious whether my hand-waving there was true. It turns out that
> it is: the radix sort has stayed about the same speed but the comparison
> sort has gotten even slower. Here are best-of-five timings for "git
> cat-file --batch-check='%(objectsize:disk)'", which does very little
> besides generate the rev-index:

Not that it matters, but I assume this was something like:

  $ time (echo HEAD | git cat-file --batch-check="%(objectsize:disk)")

... and I suspect it was on the linux.git repo, yes?

I used to have a copy of the linux repo on disk, but I had to
delete it a while ago to recover some disk space (no matter how
big disks get, they never seem big enough)!

If I do this on my biggest repo (ffmpeg), I get:

  $ cd ../ffmpeg/

  $ time (echo HEAD | git cat-file --batch-check="%(objectsize:disk)")
  227

  real	0m0.037s
  user	0m0.020s
  sys	0m0.004s

  $ time (echo HEAD | ../git/git-cat-file --batch-check="%(objectsize:disk)")
  227

  real	0m0.146s
  user	0m0.112s
  sys	0m0.012s

  $ 

Where I'm using a version with my patch applied, rather than
reverting commit 8b8dfd5132. A 395% slowdown is bad enough, but
not as bad as a factor of 11! I bet you have a much more modern
system (with a fast SSD) than my old laptop. :-D

>   [current master, using radix sort]
>   real	0m0.104s
>   user	0m0.088s
>   sys	0m0.016s
> 
>   [reverting 8b8dfd5132, going back to qsort]
>   real	0m1.193s
>   user	0m1.176s
>   sys	0m0.016s
> 
> So it's now a factor of 11. Yikes.

Thanks for looking into this, even if it was a wild
goose chase. :)

ATB,
Ramsay Jones


^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v7 00/13] nd/pack-objects-pack-struct updates
  2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
                               ` (12 preceding siblings ...)
  2018-03-21  8:24             ` Jeff King
@ 2018-03-24  6:33             ` Nguyễn Thái Ngọc Duy
  2018-03-24  6:33               ` [PATCH v7 01/13] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
                                 ` (14 more replies)
  13 siblings, 15 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Phew.. pack-objects is tough to crack. v7 changes

- the 16k pack limit is removed thanks to Jeff's suggestion. The limit
  for memory saving though is reduced down to 1k again.
- only object size below 2G is cached (previously 4G) to avoid 1 << 32
  on 32 bits.
- fix oe_size() retrieving wrong size (e.g. if entry->type is a delta
  then it must return delta size not canonical size)
- fix race condition when oe_size() (and oe_delta_size()) accesses
  object database in parallel try_delta code.
- add new GIT_TEST_ vars so that we use the whole test suite to
  exercise these hard-to-activate code paths.
- I finally fixed the "compressed" typo!

I'm not ignoring or forgetting other suggestions in this thread. The
rev-list thing is still on my todo list (either do it externally or
free up mem as Jeff suggested). revindex as well. But those can be
done later.

Interdiff

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index b8d936ccf5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -269,15 +269,6 @@ Unexpected missing object will raise an error.
 	locally created objects [without .promisor] and objects from the
 	promisor remote [with .promisor].)  This is used with partial clone.
 
-LIMITATIONS
------------
-
-This command could only handle 16384 existing pack files at a time.
-If you have more than this, you need to exclude some pack files with
-".keep" file and --honor-pack-keep option, to combine 16k pack files
-in one, then remove these .keep files and run pack-objects one more
-time.
-
 SEE ALSO
 --------
 linkgit:git-rev-list[1]
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4406af640f..c774821930 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,8 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define SIZE(obj) oe_size(&to_pack, obj)
+#define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
 #define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
@@ -276,7 +278,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (oe_type(entry) == OBJ_BLOB &&
-		    oe_size_greater_than(entry, big_file_threshold) &&
+		    oe_size_greater_than(&to_pack, entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -386,7 +388,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
-	unsigned long entry_size = oe_size(entry);
+	unsigned long entry_size = SIZE(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
@@ -1036,7 +1038,7 @@ static int want_object_in_pack(const struct object_id *oid,
 	if (*found_pack) {
 		want = want_found_object(exclude, *found_pack);
 		if (want != -1)
-			goto done;
+			return want;
 	}
 
 	list_for_each(pos, &packed_git_mru) {
@@ -1059,16 +1061,11 @@ static int want_object_in_pack(const struct object_id *oid,
 			if (!exclude && want > 0)
 				list_move(&p->mru, &packed_git_mru);
 			if (want != -1)
-				goto done;
+				return want;
 		}
 	}
 
-	want = 1;
-done:
-	if (want && *found_pack && !(*found_pack)->index)
-		oe_add_pack(&to_pack, *found_pack);
-
-	return want;
+	return 1;
 }
 
 static void create_object_entry(const struct object_id *oid,
@@ -1090,7 +1087,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		oe_set_in_pack(entry, found_pack);
+		oe_set_in_pack(&to_pack, entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1415,7 +1412,7 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	unsigned long size;
+	unsigned long canonical_size;
 
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
@@ -1427,6 +1424,7 @@ static void check_object(struct object_entry *entry)
 		off_t ofs;
 		unsigned char *buf, c;
 		enum object_type type;
+		unsigned long in_pack_size;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1436,14 +1434,13 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &size);
+						   &in_pack_size);
 		if (used == 0)
 			goto give_up;
 
 		if (type < 0)
 			die("BUG: invalid type %d", type);
 		entry->in_pack_type = type;
-		oe_set_size(entry, size);
 
 		/*
 		 * Determine if this is a delta and if so whether we can
@@ -1454,6 +1451,7 @@ static void check_object(struct object_entry *entry)
 		default:
 			/* Not a delta hence we've already got all we need. */
 			oe_set_type(entry, entry->in_pack_type);
+			SET_SIZE(entry, in_pack_size);
 			entry->in_pack_header_size = used;
 			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
@@ -1510,8 +1508,9 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
+			SET_SIZE(entry, in_pack_size); /* delta size */
 			SET_DELTA(entry, base_entry);
-			SET_DELTA_SIZE(entry, oe_size(entry));
+			SET_DELTA_SIZE(entry, in_pack_size);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1519,18 +1518,18 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (oe_type(entry)) {
-			unsigned long size;
+			off_t delta_pos;
 
-			size = get_size_from_delta(p, &w_curs,
-				entry->in_pack_offset + entry->in_pack_header_size);
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			oe_set_size(entry, size);
-			if (oe_size_less_than(entry, 1))
+			delta_pos = entry->in_pack_offset + entry->in_pack_header_size;
+			canonical_size = get_size_from_delta(p, &w_curs, delta_pos);
+			if (canonical_size == 0)
 				goto give_up;
+			SET_SIZE(entry, canonical_size);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1544,15 +1543,18 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &size));
-	/*
-	 * The error condition is checked in prepare_pack().  This is
-	 * to permit a missing preferred base object to be ignored
-	 * as a preferred base.  Doing so can result in a larger
-	 * pack file, but the transfer will still take place.
-	 */
-	if (entry->type_valid)
-		oe_set_size(entry, size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+					    &canonical_size));
+	if (entry->type_valid) {
+		SET_SIZE(entry, canonical_size);
+	} else {
+		/*
+		 * Bad object type is checked in prepare_pack().  This is
+		 * to permit a missing preferred base object to be ignored
+		 * as a preferred base.  Doing so can result in a larger
+		 * pack file, but the transfer will still take place.
+		 */
+	}
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1619,7 +1621,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	} else {
 		oe_set_type(entry, type);
 	}
-	oe_set_size(entry, size);
+	SET_SIZE(entry, size);
 }
 
 /*
@@ -1759,7 +1761,8 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (oe_size_greater_than(entry, big_file_threshold))
+		if (entry->type_valid &&
+		    oe_size_greater_than(&to_pack, entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1788,8 +1791,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
-	unsigned long a_size = oe_size(a);
-	unsigned long b_size = oe_size(b);
+	unsigned long a_size = SIZE(a);
+	unsigned long b_size = SIZE(b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1858,6 +1861,41 @@ static pthread_mutex_t progress_mutex;
 
 #endif
 
+unsigned long oe_get_size_slow(struct packing_data *pack,
+			       const struct object_entry *e)
+{
+	struct packed_git *p;
+	struct pack_window *w_curs;
+	unsigned char *buf;
+	enum object_type type;
+	unsigned long used, avail, size;
+
+	if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) {
+		read_lock();
+		if (sha1_object_info(e->idx.oid.hash, &size) < 0)
+			die(_("unable to get size of %s"),
+			    oid_to_hex(&e->idx.oid));
+		read_unlock();
+		return size;
+	}
+
+	p = oe_in_pack(pack, e);
+	if (!p)
+		die("BUG: when e->type is a delta, it must belong to a pack");
+
+	read_lock();
+	w_curs = NULL;
+	buf = use_pack(p, &w_curs, e->in_pack_offset, &avail);
+	used = unpack_object_header_buffer(buf, avail, &type, &size);
+	if (used == 0)
+		die(_("unable to parse object header of %s"),
+		    oid_to_hex(&e->idx.oid));
+
+	unuse_pack(&w_curs);
+	read_unlock();
+	return size;
+}
+
 static int try_delta(struct unpacked *trg, struct unpacked *src,
 		     unsigned max_depth, unsigned long *mem_usage)
 {
@@ -1892,7 +1930,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = oe_size(trg_entry);
+	trg_size = SIZE(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1904,7 +1942,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = oe_size(src_entry);
+	src_size = SIZE(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2026,7 +2064,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += oe_size(n->entry);
+		freed_mem += SIZE(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2476,7 +2514,8 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (oe_size_less_than(entry, 50))
+		if (!entry->type_valid ||
+		    oe_size_less_than(&to_pack, entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
@@ -3235,9 +3274,6 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
-	/* make sure IN_PACK(0) return NULL */
-	oe_add_pack(&to_pack, NULL);
-
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/pack-objects.c b/pack-objects.c
index 9558d13834..59c6e40a02 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -2,6 +2,7 @@
 #include "object.h"
 #include "pack.h"
 #include "pack-objects.h"
+#include "packfile.h"
 
 static uint32_t locate_object_entry_hash(struct packing_data *pdata,
 					 const unsigned char *sha1,
@@ -86,15 +87,54 @@ struct object_entry *packlist_find(struct packing_data *pdata,
 	return &pdata->objects[pdata->index[i] - 1];
 }
 
+static void prepare_in_pack_by_idx(struct packing_data *pdata)
+{
+	struct packed_git **mapping, *p;
+	int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
+
+	if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
+		/*
+		 * leave in_pack_by_idx NULL to force in_pack[] to be
+		 * used instead
+		 */
+		return;
+	}
+
+	ALLOC_ARRAY(mapping, nr);
+	mapping[cnt++] = NULL; /* zero index must be mapped to NULL */
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next, cnt++) {
+		if (cnt == nr) {
+			free(mapping);
+			return;
+		}
+		p->index = cnt;
+		mapping[cnt] = p;
+	}
+	pdata->in_pack_by_idx = mapping;
+}
+
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos)
 {
 	struct object_entry *new_entry;
 
+	if (!pdata->nr_objects) {
+		prepare_in_pack_by_idx(pdata);
+		if (getenv("GIT_TEST_OE_SIZE_BITS")) {
+			int bits = atoi(getenv("GIT_TEST_OE_SIZE_BITS"));;
+			pdata->oe_size_limit = 1 << bits;
+		}
+		if (!pdata->oe_size_limit)
+			pdata->oe_size_limit = 1 << OE_SIZE_BITS;
+	}
 	if (pdata->nr_objects >= pdata->nr_alloc) {
 		pdata->nr_alloc = (pdata->nr_alloc  + 1024) * 3 / 2;
 		REALLOC_ARRAY(pdata->objects, pdata->nr_alloc);
+
+		if (!pdata->in_pack_by_idx)
+			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
@@ -107,5 +147,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 	else
 		pdata->index[index_pos] = pdata->nr_objects;
 
+	if (pdata->in_pack)
+		pdata->in_pack[pdata->nr_objects - 1] = NULL;
+
 	return new_entry;
 }
diff --git a/pack-objects.h b/pack-objects.h
index af40211105..9f19672602 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,8 +3,9 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
-#define OE_IN_PACK_BITS		14
+#define OE_IN_PACK_BITS		10
 #define OE_Z_DELTA_BITS		16
+#define OE_SIZE_BITS		31
 #define OE_DELTA_SIZE_BITS	31
 
 /*
@@ -35,7 +36,9 @@ enum dfs_state {
  *
  * "size" is the uncompressed object size. Compressed size of the raw
  * data for an object in a pack is not stored anywhere but is computed
- * and made available when reverse .idx is made.
+ * and made available when reverse .idx is made. Note that when a
+ * delta is reused, "size" is the uncompressed _delta_ size, not the
+ * canonical one after the delta has been applied.
  *
  * "hash" contains a path name hash which is used for sorting the
  * delta list and also during delta searching. Once prepare_pack()
@@ -74,7 +77,8 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
 	uint32_t hash;			/* name hint hash */
-	uint32_t size_;	/* object uncompressed size _if_ size_valid is true */
+	uint32_t size_:OE_SIZE_BITS;
+	unsigned size_valid:1;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
@@ -83,24 +87,23 @@ struct object_entry {
 	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
-	unsigned size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
 	unsigned type_:TYPE_BITS;
+	unsigned no_try_delta:1;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
 				    * objects against.
 				    */
-	unsigned no_try_delta:1;
-	unsigned char in_pack_header_size;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 80, bit_padding: 16 bits */
+	/* size: 80, bit_padding: 20 bits, holes: 1 bit */
 };
 
 struct packing_data {
@@ -111,8 +114,17 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
-	int in_pack_count;
-	struct packed_git *in_pack[1 << OE_IN_PACK_BITS];
+
+	/*
+	 * Only one of these can be non-NULL and they have different
+	 * sizes. if in_pack_by_idx is allocated, oe_in_pack() returns
+	 * the pack of an object using in_pack_idx field. If not,
+	 * in_pack[] array is used the same way as in_pack_pos[]
+	 */
+	struct packed_git **in_pack_by_idx;
+	struct packed_git **in_pack;
+
+	uintmax_t oe_size_limit;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -171,38 +183,27 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
-static inline unsigned int oe_add_pack(struct packing_data *pack,
-				       struct packed_git *p)
-{
-	if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
-		die(_("too many packs to handle in one go. "
-		      "Please add .keep files to exclude\n"
-		      "some pack files and keep the number "
-		      "of non-kept files below %d."),
-		    1 << OE_IN_PACK_BITS);
-	if (p) {
-		if (p->index > 0)
-			die("BUG: this packed is already indexed");
-		p->index = pack->in_pack_count;
-	}
-	pack->in_pack[pack->in_pack_count] = p;
-	return pack->in_pack_count++;
-}
-
 static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
 					    const struct object_entry *e)
 {
-	return pack->in_pack[e->in_pack_idx];
+	if (pack->in_pack_by_idx)
+		return pack->in_pack_by_idx[e->in_pack_idx];
+	else
+		return pack->in_pack[e - pack->objects];
 
 }
 
-static inline void oe_set_in_pack(struct object_entry *e,
+static inline void oe_set_in_pack(struct packing_data *pack,
+				  struct object_entry *e,
 				  struct packed_git *p)
 {
-	if (p->index <= 0)
-		die("BUG: found_pack should be NULL "
-		    "instead of having non-positive index");
-	e->in_pack_idx = p->index;
+	if (pack->in_pack_by_idx) {
+		if (p->index <= 0)
+			die("BUG: found_pack should be NULL "
+					"instead of having non-positive index");
+			e->in_pack_idx = p->index;
+	} else
+		pack->in_pack[e - pack->objects] = p;
 
 }
 
@@ -263,56 +264,49 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
-static inline unsigned long oe_size(const struct object_entry *e)
+unsigned long oe_get_size_slow(struct packing_data *pack,
+			       const struct object_entry *e);
+static inline unsigned long oe_size(struct packing_data *pack,
+				    const struct object_entry *e)
 {
-	if (e->size_valid) {
+	if (e->size_valid)
 		return e->size_;
-	} else {
-		unsigned long size;
 
-		sha1_object_info(e->idx.oid.hash, &size);
-		return size;
-	}
+	return oe_get_size_slow(pack, e);
 }
 
-static inline int oe_fits_in_32bits(unsigned long limit)
+static inline int oe_size_less_than(struct packing_data *pack,
+				    const struct object_entry *lhs,
+				    unsigned long rhs)
 {
-	uint32_t truncated_limit = (uint32_t)limit;
-
-	return limit == truncated_limit;
-}
-
-static inline int oe_size_less_than(const struct object_entry *e,
-				    unsigned long limit)
-{
-	if (e->size_valid)
-		return e->size_ < limit;
-	if (oe_fits_in_32bits(limit)) /* limit < 2^32 <= size ? */
+	if (lhs->size_valid)
+		return lhs->size_ < rhs;
+	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
 		return 0;
-	return oe_size(e) < limit;
+	return oe_get_size_slow(pack, lhs) < rhs;
 }
 
-static inline int oe_size_greater_than(const struct object_entry *e,
-				       unsigned long limit)
+static inline int oe_size_greater_than(struct packing_data *pack,
+				       const struct object_entry *lhs,
+				       unsigned long rhs)
 {
-	if (e->size_valid)
-		return e->size_ > limit;
-	if (oe_fits_in_32bits(limit)) /* limit < 2^32 <= size ? */
+	if (lhs->size_valid)
+		return lhs->size_ > rhs;
+	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
 		return 1;
-	return oe_size(e) > limit;
+	return oe_get_size_slow(pack, lhs) > rhs;
 }
 
-static inline void oe_set_size(struct object_entry *e,
+static inline void oe_set_size(struct packing_data *pack,
+			       struct object_entry *e,
 			       unsigned long size)
 {
-	e->size_ = size;
-	e->size_valid = e->size_ == size;
-
-	if (!e->size_valid) {
-		unsigned long real_size;
-
-		if (sha1_object_info(e->idx.oid.hash, &real_size) < 0 ||
-		    size != real_size)
+	if (size < pack->oe_size_limit) {
+		e->size_ = size;
+		e->size_valid = 1;
+	} else {
+		e->size_valid = 0;
+		if (oe_get_size_slow(pack, e) != size)
 			die("BUG: 'size' is supposed to be the object size!");
 	}
 }
@@ -322,7 +316,7 @@ static inline unsigned long oe_delta_size(struct packing_data *pack,
 {
 	if (e->delta_size_valid)
 		return e->delta_size_;
-	return oe_size(e);
+	return oe_size(pack, e);
 }
 
 static inline void oe_set_delta_size(struct packing_data *pack,
@@ -331,7 +325,7 @@ static inline void oe_set_delta_size(struct packing_data *pack,
 {
 	e->delta_size_ = size;
 	e->delta_size_valid = e->delta_size_ == size;
-	if (!e->delta_size_valid && size != oe_size(e))
+	if (!e->delta_size_valid && size != oe_size(pack, e))
 		die("BUG: this can only happen in check_object() "
 		    "where delta size is the same as entry size");
 }
diff --git a/t/README b/t/README
index 1a1361a806..da117ca734 100644
--- a/t/README
+++ b/t/README
@@ -292,6 +292,26 @@ and know what setup is needed for it.  Or when you want to run
 everything up to a certain test.
 
 
+Running tests with special setups
+---------------------------------
+
+The whole test suite could be run to test some special features
+that cannot be easily covered by a few specific test cases. These
+could be enabled by running the test suite with correct GIT_TEST_
+environment set.
+
+GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
+
+GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
+path where there are more than 1024 packs even if the actual number of
+packs in repository is below this limit.
+
+GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
+code path where we do not cache object size in memory and read it
+from existing packs on demand. This normally only happens when the
+object size is over 2GB. This variable forces the code path on any
+object larger than 2^<bits> bytes.
+
 Naming Tests
 ------------
 
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index 9c68b99251..5c076637ff 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -457,6 +457,11 @@ test_expect_success !PTHREADS,C_LOCALE_OUTPUT 'pack-objects --threads=N or pack.
 	grep -F "no threads support, ignoring pack.threads" err
 '
 
+test_expect_success 'pack-objects in too-many-packs mode' '
+	GIT_TEST_FULL_IN_PACK_ARRAY=1 git repack -ad &&
+	git fsck
+'
+
 #
 # WARNING!
 #

Nguyễn Thái Ngọc Duy (13):
  pack-objects: a bit of document about struct object_entry
  pack-objects: turn type and in_pack_type to bitfields
  pack-objects: use bitfield for object_entry::dfs_state
  pack-objects: use bitfield for object_entry::depth
  pack-objects: move in_pack_pos out of struct object_entry
  pack-objects: move in_pack out of struct object_entry
  pack-objects: refer to delta objects by index instead of pointer
  pack-objects: shrink z_delta_size field in struct object_entry
  pack-objects: don't check size when the object is bad
  pack-objects: clarify the use of object_entry::size
  pack-objects: shrink size field in struct object_entry
  pack-objects: shrink delta_size field in struct object_entry
  pack-objects: reorder members to shrink struct object_entry

 Documentation/config.txt           |   4 +-
 Documentation/git-pack-objects.txt |   4 +-
 Documentation/git-repack.txt       |   4 +-
 builtin/pack-objects.c             | 353 +++++++++++++++++++----------
 cache.h                            |   3 +
 object.h                           |   1 -
 pack-bitmap-write.c                |  14 +-
 pack-bitmap.c                      |   2 +-
 pack-bitmap.h                      |   4 +-
 pack-objects.c                     |  43 ++++
 pack-objects.h                     | 306 ++++++++++++++++++++++---
 t/README                           |  10 +
 t/t5300-pack-object.sh             |   5 +
 13 files changed, 588 insertions(+), 165 deletions(-)

-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 01/13] pack-objects: a bit of document about struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-24  6:33               ` [PATCH v7 02/13] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                                 ` (13 subsequent siblings)
  14 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The role of this comment block becomes more important after we shuffle
fields around to shrink this struct. It will be much harder to see what
field is related to what.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..c0a1f61aac 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,51 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 is
+ * only valid after the object is written out and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size of the raw
+ * data for an object in a pack is not stored anywhere but is computed
+ * and made available when reverse .idx is made.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset) tuple contains the location of the
+ * object in the source pack. in_pack_header_size allows quickly
+ * skipping the header and going straight to the zlib stream.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created to
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed, delta_data points
+ * to the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 02/13] pack-objects: turn type and in_pack_type to bitfields
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
  2018-03-24  6:33               ` [PATCH v7 01/13] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 20:18                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 03/13] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                                 ` (12 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

An extra field type_valid is added to carry the equivalent of OBJ_BAD
in the original "type" field. in_pack_type always contains a valid
type so we only need 3 bits for it.

A note about accepting OBJ_NONE as "valid" type. The function
read_object_list_from_stdin() can pass this value [1] and it
eventually calls create_object_entry() where current code skips setting
"type" field if the incoming type is zero. This does not have any bad
side effects because "type" field should be memset()'d anyway.

But since we also need to set type_valid now, skipping oe_set_type()
leaves type_valid zero/false, which will make oe_type() return
OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in
prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger

    fatal: unable to get type of object ...

Accepting OBJ_NONE [2] does sound wrong, but this is how it has
been for a very long time and I haven't had time to dig in further.

[1] See 5c49c11686 (pack-objects: better check_object() performances -
    2007-04-16)

[2] 21666f1aae (convert object type handling from a string to a number
    - 2007-02-26)

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 60 ++++++++++++++++++++++++------------------
 cache.h                |  2 ++
 object.h               |  1 -
 pack-bitmap-write.c    |  6 ++---
 pack-objects.h         | 20 ++++++++++++--
 5 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..647c01ea34 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -265,7 +265,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1066,8 +1067,7 @@ static void create_object_entry(const struct object_id *oid,
 
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
-	if (type)
-		entry->type = type;
+	oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			die("BUG: invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1428,9 +1433,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1484,7 +1489,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->delta = base_entry;
 			entry->delta_size = entry->size;
 			entry->delta_sibling = base_entry->delta_child;
@@ -1493,7 +1498,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
@@ -1516,7 +1521,7 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,16 +1576,18 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
-		 * And if that fails, the error will be recorded in entry->type
+		 * And if that fails, the error will be recorded in oe_type(entry)
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &entry->size));
+	} else {
+		oe_set_type(entry, type);
 	}
 }
 
@@ -1747,10 +1755,12 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1826,7 +1836,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2432,11 +2442,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
@@ -2545,7 +2555,7 @@ static void read_object_list_from_stdin(void)
 			die("expected object ID, got garbage:\n %s", line);
 
 		add_preferred_base_object(p + 1);
-		add_object_entry(&oid, 0, p + 1, 0);
+		add_object_entry(&oid, OBJ_NONE, p + 1, 0);
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..fd11f08940 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -64,12 +64,12 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 
 		entry->in_pack_pos = i;
 
-		switch (entry->type) {
+		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
 		case OBJ_TREE:
 		case OBJ_BLOB:
 		case OBJ_TAG:
-			real_type = entry->type;
+			real_type = oe_type(entry);
 			break;
 
 		default:
@@ -98,7 +98,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 		default:
 			die("Missing type information for %s (%d/%d)",
 			    oid_to_hex(&entry->idx.oid), real_type,
-			    entry->type);
+			    oe_type(entry));
 		}
 	}
 }
diff --git a/pack-objects.h b/pack-objects.h
index c0a1f61aac..b883d7aa10 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -59,8 +59,9 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
+	unsigned type_:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
+	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
@@ -123,4 +124,19 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= OBJ_NONE;
+	e->type_ = (unsigned)type;
+}
+
 #endif
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 03/13] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
  2018-03-24  6:33               ` [PATCH v7 01/13] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
  2018-03-24  6:33               ` [PATCH v7 02/13] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 20:23                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 04/13] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                                 ` (11 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 28 +++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 647c01ea34..83f8154865 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index b883d7aa10..8507e1b869 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS	2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 /*
  * basic object info
  * -----------------
@@ -73,19 +88,10 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
 
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
+
 };
 
 struct packing_data {
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 04/13] pack-objects: use bitfield for object_entry::depth
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (2 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 03/13] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 20:26                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 05/13] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                                 ` (10 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Because of struct packing from now on we can only handle max depth
4095 (or even lower when new booleans are added in this struct). This
should be ok since long delta chains will cause significant slowdown
anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 4 ++++
 pack-objects.h                     | 5 ++---
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 83f8154865..205e1f646c 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth >= (1 << OE_DEPTH_BITS))
+		die(_("delta chain depth %d is greater than maximum limit %d"),
+		    depth, (1 << OE_DEPTH_BITS) - 1);
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 8507e1b869..59407aae3c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -89,9 +90,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-
-	int depth;
-
+	unsigned depth:OE_DEPTH_BITS;
 };
 
 struct packing_data {
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 05/13] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (3 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 04/13] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 20:30                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 06/13] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                                 ` (9 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (like objects[], the array is never freed).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 ++-
 pack-bitmap-write.c    |  8 +++++---
 pack-bitmap.c          |  2 +-
 pack-bitmap.h          |  4 +++-
 pack-objects.h         | 16 +++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 205e1f646c..e1244918a5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -879,7 +879,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index fd11f08940..f7c897515b 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index 59407aae3c..4a11653657 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -79,7 +79,6 @@ struct object_entry {
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -99,6 +98,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -144,4 +145,17 @@ static inline void oe_set_type(struct object_entry *e,
 	e->type_ = (unsigned)type;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
 #endif
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (4 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 05/13] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-24  9:42                 ` Ævar Arnfjörð Bjarmason
                                   ` (2 more replies)
  2018-03-24  6:33               ` [PATCH v7 07/13] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                                 ` (8 subsequent siblings)
  14 siblings, 3 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Instead of using 8 bytes (on 64-bit architectures) to store a pointer
to a pack, use an index, since the number of packs should be
relatively small.

This limits the number of packs we can handle to 1k. Since we can't be
sure people will never run into the situation where they have more
than 1k pack files, provide a fallback route for it.

If we find out they have too many packs, the new in_pack_by_idx[]
array (which has at most 1k elements) will not be used. Instead we
allocate an in_pack[] array that holds nr_objects elements. This is
similar to how the optional in_pack_pos field is handled.

The new simple test is just to make sure the too-many-packs code path
is at least executed. The true test is running

    make test GIT_TEST_FULL_IN_PACK_ARRAY=1

to take advantage of other special case tests.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 26 +++++++++++++++-----------
 cache.h                |  1 +
 pack-objects.c         | 36 ++++++++++++++++++++++++++++++++++++
 pack-objects.h         | 40 +++++++++++++++++++++++++++++++++++++++-
 t/README               |  4 ++++
 t/t5300-pack-object.sh |  5 +++++
 6 files changed, 100 insertions(+), 12 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index e1244918a5..b41610569e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,8 @@
 #include "list.h"
 #include "packfile.h"
 
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (oe_type(entry) == OBJ_REF_DELTA ||
 		 oe_type(entry) == OBJ_OFS_DELTA)
@@ -1074,7 +1076,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		oe_set_in_pack(&to_pack, entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1401,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1537,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1582,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1852,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.c b/pack-objects.c
index 9558d13834..13f2b2bff2 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -2,6 +2,7 @@
 #include "object.h"
 #include "pack.h"
 #include "pack-objects.h"
+#include "packfile.h"
 
 static uint32_t locate_object_entry_hash(struct packing_data *pdata,
 					 const unsigned char *sha1,
@@ -86,15 +87,47 @@ struct object_entry *packlist_find(struct packing_data *pdata,
 	return &pdata->objects[pdata->index[i] - 1];
 }
 
+static void prepare_in_pack_by_idx(struct packing_data *pdata)
+{
+	struct packed_git **mapping, *p;
+	int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
+
+	if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
+		/*
+		 * leave in_pack_by_idx NULL to force in_pack[] to be
+		 * used instead
+		 */
+		return;
+	}
+
+	ALLOC_ARRAY(mapping, nr);
+	mapping[cnt++] = NULL; /* zero index must be mapped to NULL */
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next, cnt++) {
+		if (cnt == nr) {
+			free(mapping);
+			return;
+		}
+		p->index = cnt;
+		mapping[cnt] = p;
+	}
+	pdata->in_pack_by_idx = mapping;
+}
+
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos)
 {
 	struct object_entry *new_entry;
 
+	if (!pdata->nr_objects)
+		prepare_in_pack_by_idx(pdata);
 	if (pdata->nr_objects >= pdata->nr_alloc) {
 		pdata->nr_alloc = (pdata->nr_alloc  + 1024) * 3 / 2;
 		REALLOC_ARRAY(pdata->objects, pdata->nr_alloc);
+
+		if (!pdata->in_pack_by_idx)
+			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
@@ -107,5 +140,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 	else
 		pdata->index[index_pos] = pdata->nr_objects;
 
+	if (pdata->in_pack)
+		pdata->in_pack[pdata->nr_objects - 1] = NULL;
+
 	return new_entry;
 }
diff --git a/pack-objects.h b/pack-objects.h
index 4a11653657..cae4f4fe01 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,6 +3,7 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		10
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -18,6 +19,10 @@ enum dfs_state {
 };
 
 /*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ *
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 is
@@ -65,7 +70,7 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -100,6 +105,15 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+
+	/*
+	 * Only one of these can be non-NULL and they have different
+	 * sizes. if in_pack_by_idx is allocated, oe_in_pack() returns
+	 * the pack of an object using in_pack_idx field. If not,
+	 * in_pack[] array is used the same way as in_pack_pos[]
+	 */
+	struct packed_git **in_pack_by_idx;
+	struct packed_git **in_pack;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -158,4 +172,28 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	if (pack->in_pack_by_idx)
+		return pack->in_pack_by_idx[e->in_pack_idx];
+	else
+		return pack->in_pack[e - pack->objects];
+
+}
+
+static inline void oe_set_in_pack(struct packing_data *pack,
+				  struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (pack->in_pack_by_idx) {
+		if (p->index <= 0)
+			die("BUG: found_pack should be NULL "
+					"instead of having non-positive index");
+			e->in_pack_idx = p->index;
+	} else
+		pack->in_pack[e - pack->objects] = p;
+
+}
+
 #endif
diff --git a/t/README b/t/README
index 09eb2b9768..c6130ff16d 100644
--- a/t/README
+++ b/t/README
@@ -302,6 +302,10 @@ environment set.
 
 GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
 
+GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
+path where there are more than 1024 packs even if the actual number of
+packs in repository is below this limit.
+
 Naming Tests
 ------------
 
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index 9c68b99251..5c076637ff 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -457,6 +457,11 @@ test_expect_success !PTHREADS,C_LOCALE_OUTPUT 'pack-objects --threads=N or pack.
 	grep -F "no threads support, ignoring pack.threads" err
 '
 
+test_expect_success 'pack-objects in too-many-packs mode' '
+	GIT_TEST_FULL_IN_PACK_ARRAY=1 git repack -ad &&
+	git fsck
+'
+
 #
 # WARNING!
 #
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 07/13] pack-objects: refer to delta objects by index instead of pointer
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (5 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 06/13] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 20:53                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
                                 ` (7 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

These delta pointers always point to elements in the objects[] array
in the packing_data struct. We can hold at most 4G of those objects
because the array size, nr_objects, is a uint32_t. We could use
uint32_t indexes to address these elements instead of pointers. On
64-bit architectures (8 bytes per pointer) this would save 4 bytes
per pointer.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
 pack-objects.h         |  67 ++++++++++++++++++++++--
 2 files changed, 124 insertions(+), 59 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b41610569e..f7d3f6a1a8 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -127,11 +133,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -288,12 +294,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -317,7 +323,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -343,7 +349,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,8 +385,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -408,7 +414,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -427,7 +433,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -467,13 +473,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -489,7 +495,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -541,12 +547,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -608,34 +614,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -646,7 +652,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -661,8 +667,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -672,11 +678,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1493,10 +1499,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1567,17 +1573,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1617,7 +1625,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1642,7 +1650,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1665,7 +1673,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1680,7 +1688,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1865,7 +1873,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1941,7 +1949,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1970,7 +1978,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1979,13 +1987,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2054,7 +2062,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2104,7 +2112,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2112,7 +2120,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2433,7 +2441,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index cae4f4fe01..433c194ffe 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -72,11 +72,11 @@ struct object_entry {
 	unsigned long size;	/* uncompressed size */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -196,4 +196,61 @@ static inline void oe_set_in_pack(struct packing_data *pack,
 
 }
 
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (6 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 07/13] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 20:59                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 09/13] pack-objects: don't check size when the object is bad Nguyễn Thái Ngọc Duy
                                 ` (6 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only cache deltas when they are smaller than a certain limit. This limit
defaults to 1000, but we save the compressed length in a 64-bit field.
Shrink that field down to 16 bits, so you can only cache 65kb deltas.
Larger deltas must be recomputed when the pack is written down.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  3 ++-
 builtin/pack-objects.c   | 22 ++++++++++++++++------
 pack-objects.h           |  3 ++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9bd3f5a789..00fa824448 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2449,7 +2449,8 @@ pack.deltaCacheLimit::
 	The maximum size of a delta, that is cached in
 	linkgit:git-pack-objects[1]. This cache is used to speed up the
 	writing object phase by not having to recompute the final delta
-	result once the best match for all objects is found. Defaults to 1000.
+	result once the best match for all objects is found.
+	Defaults to 1000. Maximum value is 65535.
 
 pack.threads::
 	Specifies the number of threads to spawn when searching for best
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f7d3f6a1a8..b3e19815f1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2100,12 +2100,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * between writes at that moment.
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
-			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
-			cache_lock();
-			delta_cache_size -= entry->delta_size;
-			delta_cache_size += entry->z_delta_size;
-			cache_unlock();
+			unsigned long size;
+
+			size = do_compress(&entry->delta_data, entry->delta_size);
+			if (size < (1 << OE_Z_DELTA_BITS)) {
+				entry->z_delta_size = size;
+				cache_lock();
+				delta_cache_size -= entry->delta_size;
+				delta_cache_size += entry->z_delta_size;
+				cache_unlock();
+			} else {
+				FREE_AND_NULL(entry->delta_data);
+				entry->z_delta_size = 0;
+			}
 		}
 
 		/* if we made n a delta, and if n is already at max
@@ -3084,6 +3091,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (depth >= (1 << OE_DEPTH_BITS))
 		die(_("delta chain depth %d is greater than maximum limit %d"),
 		    depth, (1 << OE_DEPTH_BITS) - 1);
+	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
+		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
+		    (1 << OE_Z_DELTA_BITS) - 1);
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 433c194ffe..415b961245 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,7 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		10
+#define OE_Z_DELTA_BITS		16
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -79,7 +80,7 @@ struct object_entry {
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
-	unsigned long z_delta_size;	/* delta data size (compressed) */
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 09/13] pack-objects: don't check size when the object is bad
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (7 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-24  6:33               ` [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
                                 ` (5 subsequent siblings)
  14 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

sha1_object_info() in check_object() may fail to locate an object in
the pack and return type OBJ_BAD. In that case, it will likely leave
the "size" field untouched. We delay error handling until later in
prepare_pack() though. Until then, do not touch "size" field.

This field should contain the default value zero, but we can't say
sha1_object_info() cannot damage it. This becomes more important later
when the object size may have to be retrieved back from the
(non-existing) pack.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b3e19815f1..da010f7d19 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1741,7 +1741,7 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold < entry->size)
+		if (entry->type_valid && big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
 	}
 
@@ -2454,7 +2454,7 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (entry->size < 50)
+		if (!entry->type_valid || entry->size < 50)
 			continue;
 
 		if (entry->no_try_delta)
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (8 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 09/13] pack-objects: don't check size when the object is bad Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 21:04                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 11/13] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
                                 ` (4 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

While this field most of the time contains the canonical object size,
there is one case it does not: when we have found that the base object
of the delta in question is also to be packed, we will very happily
reuse the delta by copying it over instead of regenerating the new
delta.

"size" in this case will record the delta size, not canonical object
size. Later on in write_reuse_object(), we reconstruct the delta
header and "size" is used for this purpose. When this happens, the
"type" field contains a delta type instead of a canonical type.
Highlight this in the code since it could be tricky to see.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 11 ++++++++---
 pack-objects.h         |  4 +++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index da010f7d19..f054ba9dfa 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1417,6 +1417,7 @@ static void check_object(struct object_entry *entry)
 		off_t ofs;
 		unsigned char *buf, c;
 		enum object_type type;
+		unsigned long in_pack_size;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1426,7 +1427,7 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &entry->size);
+						   &in_pack_size);
 		if (used == 0)
 			goto give_up;
 
@@ -1443,6 +1444,7 @@ static void check_object(struct object_entry *entry)
 		default:
 			/* Not a delta hence we've already got all we need. */
 			oe_set_type(entry, entry->in_pack_type);
+			entry->size = in_pack_size;
 			entry->in_pack_header_size = used;
 			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
@@ -1499,6 +1501,7 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
+			entry->size = in_pack_size; /* delta size */
 			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
@@ -1508,13 +1511,15 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (oe_type(entry)) {
+			off_t delta_pos;
+
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
+			delta_pos = entry->in_pack_offset + entry->in_pack_header_size;
+			entry->size = get_size_from_delta(p, &w_curs, delta_pos);
 			if (entry->size == 0)
 				goto give_up;
 			unuse_pack(&w_curs);
diff --git a/pack-objects.h b/pack-objects.h
index 415b961245..d23e17050c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -34,7 +34,9 @@ enum dfs_state {
  *
  * "size" is the uncompressed object size. Compressed size of the raw
  * data for an object in a pack is not stored anywhere but is computed
- * and made available when reverse .idx is made.
+ * and made available when reverse .idx is made. Note that when a
+ * delta is reused, "size" is the uncompressed _delta_ size, not the
+ * canonical one after the delta has been applied.
  *
  * "hash" contains a path name hash which is used for sorting the
  * delta list and also during delta searching. Once prepare_pack()
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 11/13] pack-objects: shrink size field in struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (9 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 21:18                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 12/13] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
                                 ` (3 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

It's very very rare that an uncompressed object is larger than 4GB
(partly because Git does not handle those large files very well to
begin with). Let's optimize it for the common case where object size
is smaller than this limit.

Shrink size field down to 31 bits [1] and one overflow bit. If the
size is too large, we read it back from disk. As noted in the previous
patch, we need to return the delta size instead of canonical size when
the to-be-reused object entry type is a delta instead of a canonical
one.

Add two compare helpers that can take advantage of the overflow
bit (e.g. if the file is 4GB+, chances are it's already larger than
core.bigFileThreshold and there's no point in comparing the actual
value).

Another note about oe_get_size_slow(). This function MUST be thread
safe because SIZE() macro is used inside try_delta() which may run in
parallel. Outside parallel code, no-contention locking should be dirt
cheap (or insignificant compared to i/o access anyway). To exercise
this code, it's best to run the test suite with something like

    make test GIT_TEST_OE_SIZE_BITS=2

which forces this code on all objects larger than 3 bytes.

[1] it's actually already 32 bits on Windows

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 100 +++++++++++++++++++++++++++++++----------
 pack-objects.c         |   9 +++-
 pack-objects.h         |  53 +++++++++++++++++++++-
 t/README               |   6 +++
 4 files changed, 142 insertions(+), 26 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f054ba9dfa..caeef086d3 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,8 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define SIZE(obj) oe_size(&to_pack, obj)
+#define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
@@ -274,7 +276,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (oe_type(entry) == OBJ_BLOB &&
-		    entry->size > big_file_threshold &&
+		    oe_size_greater_than(&to_pack, entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -384,12 +386,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
+	unsigned long entry_size = SIZE(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
-					      type, entry->size);
+					      type, entry_size);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -406,7 +409,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	datalen -= entry->in_pack_header_size;
 
 	if (!pack_to_stdout && p->index_version == 1 &&
-	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
 		error("corrupt packed object for %s",
 		      oid_to_hex(&entry->idx.oid));
 		unuse_pack(&w_curs);
@@ -1407,6 +1410,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
+	unsigned long canonical_size;
+
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
@@ -1444,7 +1449,7 @@ static void check_object(struct object_entry *entry)
 		default:
 			/* Not a delta hence we've already got all we need. */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->size = in_pack_size;
+			SET_SIZE(entry, in_pack_size);
 			entry->in_pack_header_size = used;
 			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
@@ -1501,9 +1506,9 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->size = in_pack_size; /* delta size */
+			SET_SIZE(entry, in_pack_size); /* delta size */
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = entry->size;
+			entry->delta_size = in_pack_size;
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1519,9 +1524,10 @@ static void check_object(struct object_entry *entry)
 			 * object size from the delta header.
 			 */
 			delta_pos = entry->in_pack_offset + entry->in_pack_header_size;
-			entry->size = get_size_from_delta(p, &w_curs, delta_pos);
-			if (entry->size == 0)
+			canonical_size = get_size_from_delta(p, &w_curs, delta_pos);
+			if (canonical_size == 0)
 				goto give_up;
+			SET_SIZE(entry, canonical_size);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1535,13 +1541,18 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
-	/*
-	 * The error condition is checked in prepare_pack().  This is
-	 * to permit a missing preferred base object to be ignored
-	 * as a preferred base.  Doing so can result in a larger
-	 * pack file, but the transfer will still take place.
-	 */
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+					    &canonical_size));
+	if (entry->type_valid) {
+		SET_SIZE(entry, canonical_size);
+	} else {
+		/*
+		 * Bad object type is checked in prepare_pack().  This is
+		 * to permit a missing preferred base object to be ignored
+		 * as a preferred base.  Doing so can result in a larger
+		 * pack file, but the transfer will still take place.
+		 */
+	}
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1581,6 +1592,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
+	unsigned long size;
 
 	while (*idx) {
 		struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1593,7 +1605,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
-	oi.sizep = &entry->size;
+	oi.sizep = &size;
 	oi.typep = &type;
 	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
@@ -1603,10 +1615,11 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * and dealt with in prepare_pack().
 		 */
 		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
-						    &entry->size));
+						    &size));
 	} else {
 		oe_set_type(entry, type);
 	}
+	SET_SIZE(entry, size);
 }
 
 /*
@@ -1746,7 +1759,8 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (entry->type_valid && big_file_threshold < entry->size)
+		if (entry->type_valid &&
+		    oe_size_greater_than(&to_pack, entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1775,6 +1789,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
+	unsigned long a_size = SIZE(a);
+	unsigned long b_size = SIZE(b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1788,9 +1804,9 @@ static int type_size_sort(const void *_a, const void *_b)
 		return -1;
 	if (a->preferred_base < b->preferred_base)
 		return 1;
-	if (a->size > b->size)
+	if (a_size > b_size)
 		return -1;
-	if (a->size < b->size)
+	if (a_size < b_size)
 		return 1;
 	return a < b ? -1 : (a > b);  /* newest first */
 }
@@ -1843,6 +1859,41 @@ static pthread_mutex_t progress_mutex;
 
 #endif
 
+unsigned long oe_get_size_slow(struct packing_data *pack,
+			       const struct object_entry *e)
+{
+	struct packed_git *p;
+	struct pack_window *w_curs;
+	unsigned char *buf;
+	enum object_type type;
+	unsigned long used, avail, size;
+
+	if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) {
+		read_lock();
+		if (sha1_object_info(e->idx.oid.hash, &size) < 0)
+			die(_("unable to get size of %s"),
+			    oid_to_hex(&e->idx.oid));
+		read_unlock();
+		return size;
+	}
+
+	p = oe_in_pack(pack, e);
+	if (!p)
+		die("BUG: when e->type is a delta, it must belong to a pack");
+
+	read_lock();
+	w_curs = NULL;
+	buf = use_pack(p, &w_curs, e->in_pack_offset, &avail);
+	used = unpack_object_header_buffer(buf, avail, &type, &size);
+	if (used == 0)
+		die(_("unable to parse object header of %s"),
+		    oid_to_hex(&e->idx.oid));
+
+	unuse_pack(&w_curs);
+	read_unlock();
+	return size;
+}
+
 static int try_delta(struct unpacked *trg, struct unpacked *src,
 		     unsigned max_depth, unsigned long *mem_usage)
 {
@@ -1877,7 +1928,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = trg_entry->size;
+	trg_size = SIZE(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1889,7 +1940,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = src_entry->size;
+	src_size = SIZE(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2009,7 +2060,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += n->entry->size;
+		freed_mem += SIZE(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2459,7 +2510,8 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (!entry->type_valid || entry->size < 50)
+		if (!entry->type_valid ||
+		    oe_size_less_than(&to_pack, entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
diff --git a/pack-objects.c b/pack-objects.c
index 13f2b2bff2..59c6e40a02 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -120,8 +120,15 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 {
 	struct object_entry *new_entry;
 
-	if (!pdata->nr_objects)
+	if (!pdata->nr_objects) {
 		prepare_in_pack_by_idx(pdata);
+		if (getenv("GIT_TEST_OE_SIZE_BITS")) {
+			int bits = atoi(getenv("GIT_TEST_OE_SIZE_BITS"));;
+			pdata->oe_size_limit = 1 << bits;
+		}
+		if (!pdata->oe_size_limit)
+			pdata->oe_size_limit = 1 << OE_SIZE_BITS;
+	}
 	if (pdata->nr_objects >= pdata->nr_alloc) {
 		pdata->nr_alloc = (pdata->nr_alloc  + 1024) * 3 / 2;
 		REALLOC_ARRAY(pdata->objects, pdata->nr_alloc);
diff --git a/pack-objects.h b/pack-objects.h
index d23e17050c..8bb082f22f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -5,6 +5,7 @@
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		10
 #define OE_Z_DELTA_BITS		16
+#define OE_SIZE_BITS		31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -72,7 +73,8 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;	/* uncompressed size */
+	uint32_t size_:OE_SIZE_BITS;
+	uint32_t size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
@@ -117,6 +119,8 @@ struct packing_data {
 	 */
 	struct packed_git **in_pack_by_idx;
 	struct packed_git **in_pack;
+
+	uintmax_t oe_size_limit;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -256,4 +260,51 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
+unsigned long oe_get_size_slow(struct packing_data *pack,
+			       const struct object_entry *e);
+static inline unsigned long oe_size(struct packing_data *pack,
+				    const struct object_entry *e)
+{
+	if (e->size_valid)
+		return e->size_;
+
+	return oe_get_size_slow(pack, e);
+}
+
+static inline int oe_size_less_than(struct packing_data *pack,
+				    const struct object_entry *lhs,
+				    unsigned long rhs)
+{
+	if (lhs->size_valid)
+		return lhs->size_ < rhs;
+	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
+		return 0;
+	return oe_get_size_slow(pack, lhs) < rhs;
+}
+
+static inline int oe_size_greater_than(struct packing_data *pack,
+				       const struct object_entry *lhs,
+				       unsigned long rhs)
+{
+	if (lhs->size_valid)
+		return lhs->size_ > rhs;
+	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
+		return 1;
+	return oe_get_size_slow(pack, lhs) > rhs;
+}
+
+static inline void oe_set_size(struct packing_data *pack,
+			       struct object_entry *e,
+			       unsigned long size)
+{
+	if (size < pack->oe_size_limit) {
+		e->size_ = size;
+		e->size_valid = 1;
+	} else {
+		e->size_valid = 0;
+		if (oe_get_size_slow(pack, e) != size)
+			die("BUG: 'size' is supposed to be the object size!");
+	}
+}
+
 #endif
diff --git a/t/README b/t/README
index c6130ff16d..da117ca734 100644
--- a/t/README
+++ b/t/README
@@ -306,6 +306,12 @@ GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
 path where there are more than 1024 packs even if the actual number of
 packs in repository is below this limit.
 
+GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
+code path where we do not cache object size in memory and read it
+from existing packs on demand. This normally only happens when the
+object size is over 2GB. This variable forces the code path on any
+object larger than 2^<bits> bytes.
+
 Naming Tests
 ------------
 
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 12/13] pack-objects: shrink delta_size field in struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (10 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 11/13] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 21:24                 ` Jeff King
  2018-03-24  6:33               ` [PATCH v7 13/13] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
                                 ` (2 subsequent siblings)
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Allowing a delta size of 64 bits is crazy. Shrink this field down to
31 bits with one overflow bit.

If we find an existing delta larger than 2GB, we do not cache
delta_size at all and will get the value from oe_size(), potentially
from disk if it's larger than 4GB.

Note, since DELTA_SIZE() is used in try_delta() code, it must be
thread-safe. Luckily oe_size() does guarantee this, so it is
thread-safe.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 24 ++++++++++++++----------
 pack-objects.h         | 23 ++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index caeef086d3..c774821930 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -32,10 +32,12 @@
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
 #define SIZE(obj) oe_size(&to_pack, obj)
 #define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
+#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -142,7 +144,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != entry->delta_size)
+	if (!delta_buf || delta_size != DELTA_SIZE(entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -293,14 +295,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1508,7 +1510,7 @@ static void check_object(struct object_entry *entry)
 			oe_set_type(entry, entry->in_pack_type);
 			SET_SIZE(entry, in_pack_size); /* delta size */
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = in_pack_size;
+			SET_DELTA_SIZE(entry, in_pack_size);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1933,7 +1935,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = trg_entry->delta_size;
+		max_size = DELTA_SIZE(trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -2004,10 +2006,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
+	if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
+		return 0;
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == trg_entry->delta_size &&
+		if (delta_size == DELTA_SIZE(trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -2022,7 +2026,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= trg_entry->delta_size;
+		delta_cache_size -= DELTA_SIZE(trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -2035,7 +2039,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	trg_entry->delta_size = delta_size;
+	SET_DELTA_SIZE(trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2158,11 +2162,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, entry->delta_size);
+			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
 			if (size < (1 << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
-				delta_cache_size -= entry->delta_size;
+				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index 8bb082f22f..6d0345a82c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -6,6 +6,7 @@
 #define OE_IN_PACK_BITS		10
 #define OE_Z_DELTA_BITS		16
 #define OE_SIZE_BITS		31
+#define OE_DELTA_SIZE_BITS	31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -83,7 +84,8 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
-	unsigned long delta_size;	/* delta data size (uncompressed) */
+	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	uint32_t delta_size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -307,4 +309,23 @@ static inline void oe_set_size(struct packing_data *pack,
 	}
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	if (e->delta_size_valid)
+		return e->delta_size_;
+	return oe_size(pack, e);
+}
+
+static inline void oe_set_delta_size(struct packing_data *pack,
+				     struct object_entry *e,
+				     unsigned long size)
+{
+	e->delta_size_ = size;
+	e->delta_size_valid = e->delta_size_ == size;
+	if (!e->delta_size_valid && size != oe_size(pack, e))
+		die("BUG: this can only happen in check_object() "
+		    "where delta size is the same as entry size");
+}
+
 #endif
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v7 13/13] pack-objects: reorder members to shrink struct object_entry
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (11 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 12/13] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-24  6:33               ` Nguyễn Thái Ngọc Duy
  2018-03-30 21:26                 ` Jeff King
  2018-03-26 15:13               ` [PATCH v7 00/13] nd/pack-objects-pack-struct updates Jeff King
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
  14 siblings, 1 reply; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  6:33 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Previous patches leave lots of holes and padding in this struct. This
patch reorders the members and shrinks the struct down to 80 bytes
(from 136 bytes, before any field shrinking is done) with 16 bits to
spare (and a couple more in in_pack_header_size when we really run out
of bits).

This is the last in a series of memory reduction patches (see
"pack-objects: a bit of document about struct object_entry" for the
first one).

Overall they've reduced repack memory size on linux-2.6.git from
3.747G to 3.424G, or by around 320M, a decrease of 8.5%. The runtime
of repack has stayed the same throughout this series. Ævar's testing
on a big monorepo he has access to (bigger than linux-2.6.git) has
shown a 7.9% reduction, so the overall expected improvement should be
somewhere around 8%.

See 87po42cwql.fsf@evledraar.gmail.com on-list
(https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/) for
more detailed numbers and a test script used to produce the numbers
cited above.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index 6d0345a82c..9f19672602 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -74,34 +74,36 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	uint32_t size_:OE_SIZE_BITS;
-	uint32_t size_valid:1;
-	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
+	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
+	uint32_t hash;			/* name hint hash */
+	uint32_t size_:OE_SIZE_BITS;
+	unsigned size_valid:1;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	void *delta_data;	/* cached delta (uncompressed) */
 	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	uint32_t delta_size_valid:1;
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
+	unsigned type_valid:1;
 	unsigned type_:TYPE_BITS;
+	unsigned no_try_delta:1;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
-	unsigned type_valid:1;
-	uint32_t hash;			/* name hint hash */
-	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
 				    * objects against.
 				    */
-	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
+
+	/* size: 80, bit_padding: 20 bits, holes: 1 bit */
 };
 
 struct packing_data {
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v6 00/11] nd/pack-objects-pack-struct updates
  2018-03-23 16:01                         ` Ramsay Jones
@ 2018-03-24  6:40                           ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-24  6:40 UTC (permalink / raw)
  To: Ramsay Jones
  Cc: Duy Nguyen, Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 23, 2018 at 04:01:50PM +0000, Ramsay Jones wrote:

> Not that it matters, but I assume this was something like:
> 
>   $ time (echo HEAD | git cat-file --batch-check="%(objectsize:disk)")
> 
> ... and I suspect it was on the linux.git repo, yes?

Yes to both.

> If I do this on my biggest repo (ffmpeg), I get:
> 
>   $ cd ../ffmpeg/
> 
>   $ time (echo HEAD | git cat-file --batch-check="%(objectsize:disk)")
>   227
> 
>   real	0m0.037s
>   user	0m0.020s
>   sys	0m0.004s
> 
>   $ time (echo HEAD | ../git/git-cat-file --batch-check="%(objectsize:disk)")
>   227
> 
>   real	0m0.146s
>   user	0m0.112s
>   sys	0m0.012s
> 
>   $ 
> 
> Where I'm using a version with my patch applied, rather than
> reverting commit 8b8dfd5132. A 395% slowdown is bad enough, but
> not as bad as a factor of 11! I bet you have a much more modern
> system (with a fast SSD) than my old laptop. :-D

Yes, though it was all being run out of disk cache anyway. I also have a
lot of RAM. :)

The ffmpeg repository only has ~550k objects. So log2(n) is about 19, and
we'd expect radix to be something like 8-9x faster than a comparison
sort. But at some point the constants take over too (each O(n) round of
the radix sort actually has to look at the items twice, and of course
there are negative cache effects when duplicating the array).

So your numbers match what I'd expect.

> Thanks for looking into this, even if it was a wild
> goose chase. :)

No problem. I think it's nice to sanity check my own hand-waving once in
a while. ;)

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v4 0/7] nd/repack-keep-pack updates
  2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                         ` (6 preceding siblings ...)
  2018-03-16 19:27       ` [PATCH v3 7/7] pack-objects: display progress in get_object_details() Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25       ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 1/7] t7700: have closing quote of a test at the beginning of line Nguyễn Thái Ngọc Duy
                           ` (6 more replies)
  7 siblings, 7 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

v4 mostly refines the tests and makes other minor fixes on top of v3.

- --keep-base-pack is renamed to --keep-largest-pack
- "Counting objects" progress line is back
- test and docs updates

Interdiff

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 6b602f918f..cf862d3edf 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1562,7 +1562,8 @@ gc.bigPackThreshold::
 	If non-zero, all packs larger than this limit are kept when
 	`git gc` is run. This is very similar to `--keep-base-pack`
 	except that all packs that meet the threshold are kept, not
-	just the base pack. Defaults to zero.
+	just the base pack. Defaults to zero. Common unit suffixes of
+	'k', 'm', or 'g' are supported.
 +
 Note that if the number of kept packs is more than gc.autoPackLimit,
 this configuration variable is ignored, all packs except the base pack
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 19b0d1741b..7549094900 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -9,7 +9,7 @@ git-gc - Cleanup unnecessary files and optimize the local repository
 SYNOPSIS
 --------
 [verse]
-'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force] [--keep-base-pack]
+'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force] [--keep-largest-pack]
 
 DESCRIPTION
 -----------
@@ -84,11 +84,10 @@ packs.
 	Force `git gc` to run even if there may be another `git gc`
 	instance running on this repository.
 
---keep-base-pack::
-	All packs except the base pack and those marked with a `.keep`
-	files are consolidated into a single pack. The largest pack is
-	considered the base pack. When this option is used,
-	`gc.bigPackThreshold` is ignored.
+--keep-largest-pack::
+	All packs except the largest pack and those marked with a
+	`.keep` file are consolidated into a single pack. When this
+	option is used, `gc.bigPackThreshold` is ignored.
 
 Configuration
 -------------
diff --git a/builtin/gc.c b/builtin/gc.c
index 140c1bb7dd..d0b04e369e 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -344,7 +344,7 @@ static int need_to_gc(void)
 				find_base_packs(&keep_pack, 0);
 			}
 		} else {
-			struct packed_git * p = find_base_packs(&keep_pack, 0);
+			struct packed_git *p = find_base_packs(&keep_pack, 0);
 			uint64_t mem_have, mem_want;
 
 			mem_have = total_ram();
@@ -504,8 +504,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 		OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
 		OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
 		OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
-		OPT_BOOL(0, "keep-base-pack", &keep_base_pack,
-			 N_("repack all other packs except the base pack")),
+		OPT_BOOL(0, "keep-largest-pack", &keep_base_pack,
+			 N_("repack all other packs except the largest pack")),
 		OPT_END()
 	};
 
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7d738627fc..1379b4cb92 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -54,7 +54,8 @@ static int pack_loose_unreachable;
 static int local;
 static int have_non_local_packs;
 static int incremental;
-static int ignore_packed_keep, ignore_packed_keep_in_core;
+static int ignore_packed_keep_on_disk;
+static int ignore_packed_keep_in_core;
 static int allow_ofs_delta;
 static struct pack_idx_option pack_idx_opts;
 static const char *base_name;
@@ -983,14 +984,15 @@ static int want_found_object(int exclude, struct packed_git *p)
 	 * Otherwise, we signal "-1" at the end to tell the caller that we do
 	 * not know either way, and it needs to check more packs.
 	 */
-	if (!ignore_packed_keep && !ignore_packed_keep_in_core &&
+	if (!ignore_packed_keep_on_disk &&
+	    !ignore_packed_keep_in_core &&
 	    (!local || !have_non_local_packs))
 		return 1;
 
 	if (local && !p->pack_local)
 		return 0;
 	if (p->pack_local &&
-	    ((ignore_packed_keep && p->pack_keep) ||
+	    ((ignore_packed_keep_on_disk && p->pack_keep) ||
 	     (ignore_packed_keep_in_core && p->pack_keep_in_core)))
 		return 0;
 
@@ -1716,7 +1718,7 @@ static void get_object_details(void)
 	struct object_entry **sorted_by_offset;
 
 	if (progress)
-		progress_state = start_progress(_("Getting object details"),
+		progress_state = start_progress(_("Counting objects"),
 						to_pack.nr_objects);
 
 	sorted_by_offset = xcalloc(to_pack.nr_objects, sizeof(struct object_entry *));
@@ -2818,7 +2820,7 @@ static int pack_options_allow_reuse(void)
 {
 	return pack_to_stdout &&
 	       allow_ofs_delta &&
-	       !ignore_packed_keep &&
+	       !ignore_packed_keep_on_disk &&
 	       !ignore_packed_keep_in_core &&
 	       (!local || !have_non_local_packs) &&
 	       !incremental;
@@ -3059,7 +3061,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("create thin packs")),
 		OPT_BOOL(0, "shallow", &shallow,
 			 N_("create packs suitable for shallow fetches")),
-		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep,
+		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
 			 N_("ignore packs that have companion .keep file")),
 		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
 				N_("ignore this pack")),
@@ -3192,19 +3194,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 
 	prepare_packed_git();
 	add_extra_kept_packs(&keep_pack_list);
-	if (ignore_packed_keep) {
+	if (ignore_packed_keep_on_disk) {
 		struct packed_git *p;
 		for (p = packed_git; p; p = p->next)
 			if (p->pack_local && p->pack_keep)
 				break;
 		if (!p) /* no keep-able packs found */
-			ignore_packed_keep = 0;
+			ignore_packed_keep_on_disk = 0;
 	}
 	if (local) {
 		/*
-		 * unlike ignore_packed_keep above, we do not want to
-		 * unset "local" based on looking at packs, as it
-		 * also covers non-local objects
+		 * unlike ignore_packed_keep_on_disk above, we do not
+		 * want to unset "local" based on looking at packs, as
+		 * it also covers non-local objects
 		 */
 		struct packed_git *p;
 		for (p = packed_git; p; p = p->next) {
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 96ca70f9cc..100f287b97 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -50,6 +50,31 @@ test_expect_success 'gc is not aborted due to a stale symref' '
 	)
 '
 
+test_expect_success 'gc --keep-largest-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		test_commit one &&
+		test_commit two &&
+		test_commit three &&
+		git gc &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 1 pack-list &&
+		BASE_PACK=.git/objects/pack/pack-*.pack &&
+		test_commit four &&
+		git repack -d &&
+		test_commit five &&
+		git repack -d &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 3 pack-list &&
+		git gc --keep-largest-pack &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 2 pack-list &&
+		test_path_is_file $BASE_PACK &&
+		git fsck
+	)
+'
+
 test_expect_success 'auto gc with too many loose objects does not attempt to create bitmaps' '
 	test_config gc.auto 3 &&
 	test_config gc.autodetach false &&
@@ -123,28 +148,6 @@ test_expect_success 'background auto gc respects lock for all operations' '
 	test_path_is_file .git/refs/heads/should-be-loose
 '
 
-test_expect_success 'gc --keep-base-pack' '
-	test_create_repo keep-pack &&
-	(
-		cd keep-pack &&
-		for i in 10; do
-			test_commit $i
-		done &&
-		git gc &&
-		( cd .git/objects/pack && ls *.pack ) >pack-list &&
-		test_line_count = 1 pack-list &&
-		BASE_PACK=.git/objects/pack/pack-*.pack &&
-		for i in 10; do
-			test_commit more-$i
-		done &&
-		git gc --keep-base-pack &&
-		( cd .git/objects/pack && ls *.pack ) >pack-list &&
-		test_line_count = 2 pack-list &&
-		test_path_is_file $BASE_PACK &&
-		git fsck
-	)
-'
-
 # DO NOT leave a detached auto gc process running near the end of the
 # test script: it can run long enough in the background to racily
 # interfere with the cleanup in 'test_done'.
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 05ae0de3aa..6162e2a8e6 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -4,6 +4,12 @@ test_description='git repack works correctly'
 
 . ./test-lib.sh
 
+commit_and_pack() {
+	test_commit "$@" >/dev/null &&
+	SHA1=$(git pack-objects --all --unpacked --incremental .git/objects/pack/pack </dev/null) &&
+	echo pack-${SHA1}.pack
+}
+
 test_expect_success 'objects in packs marked .keep are not repacked' '
 	echo content1 > file1 &&
 	echo content2 > file2 &&
@@ -200,22 +206,16 @@ test_expect_success 'repack --keep-pack' '
 	test_create_repo keep-pack &&
 	(
 		cd keep-pack &&
-		test_commit one &&
-		git repack -d &&
-		test_commit two &&
-		git repack -d &&
-		test_commit three &&
-		git repack -d &&
-		test_commit four &&
-		git repack -d &&
-		( cd .git/objects/pack && ls *.pack ) >pack-list &&
-		test_line_count = 4 pack-list &&
-		KEEP1=$(head -n1 pack-list) &&
-		KEEP4=$(tail -n1 pack-list) &&
-		git repack -a -d --keep-pack $KEEP1 --keep-pack $KEEP4 &&
+		P1=$(commit_and_pack 1) &&
+		P2=$(commit_and_pack 2) &&
+		P3=$(commit_and_pack 3) &&
+		P4=$(commit_and_pack 4) &&
+		ls .git/objects/pack/*.pack >old-counts &&
+		test_line_count = 4 old-counts &&
+		git repack -a -d --keep-pack $P1 --keep-pack $P4 &&
 		ls .git/objects/pack/*.pack >new-counts &&
-		grep -q $KEEP1 new-counts &&
-		grep -q $KEEP4 new-counts &&
+		grep -q $P1 new-counts &&
+		grep -q $P4 new-counts &&
 		test_line_count = 3 new-counts &&
 		git fsck
 	)

Nguyễn Thái Ngọc Duy (7):
  t7700: have closing quote of a test at the beginning of line
  repack: add --keep-pack option
  gc: add --keep-largest-pack option
  gc: add gc.bigPackThreshold config
  gc: handle a corner case in gc.bigPackThreshold
  gc --auto: exclude base pack if not enough mem to "repack -ad"
  pack-objects: show some progress when counting kept objects

 Documentation/config.txt           |  12 +++
 Documentation/git-gc.txt           |  19 +++-
 Documentation/git-pack-objects.txt |   9 +-
 Documentation/git-repack.txt       |   9 +-
 builtin/gc.c                       | 167 +++++++++++++++++++++++++++--
 builtin/pack-objects.c             |  84 +++++++++++----
 builtin/repack.c                   |  21 +++-
 cache.h                            |   1 +
 config.mak.uname                   |   1 +
 git-compat-util.h                  |   4 +
 pack-objects.h                     |   2 +
 t/t6500-gc.sh                      |  32 ++++++
 t/t7700-repack.sh                  |  27 ++++-
 13 files changed, 352 insertions(+), 36 deletions(-)

-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 1/7] t7700: have closing quote of a test at the beginning of line
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 2/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
                           ` (5 subsequent siblings)
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The closing quote of a test body by convention is always at the start
of a line.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t7700-repack.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 6061a04147..38247afbec 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -194,7 +194,7 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
 	git reflog expire --expire=$test_tick --expire-unreachable=$test_tick --all &&
 	git repack -a -d &&
 	git cat-file -t $H1
-	'
+'
 
 test_done
 
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 2/7] repack: add --keep-pack option
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 1/7] t7700: have closing quote of a test at the beginning of line Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 3/7] gc: add --keep-largest-pack option Nguyễn Thái Ngọc Duy
                           ` (4 subsequent siblings)
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We allow keeping existing packs by giving them companion .keep files.
This is helpful when a pack is to be kept permanently. In the next
patch, git-gc just wants to keep a pack temporarily, for one
pack-objects run. git-gc can use --keep-pack for this use case.

A note about why the pack_keep field cannot be reused and
pack_keep_in_core has to be added: this is about the case when
--keep-pack is specified together with either --keep-unreachable or
--unpack-unreachable, but --honor-pack-keep is NOT specified.

In this case, we want to exclude objects from the packs specified on
command line, not from ones with .keep files. If only one bit flag is
used, we have to clear pack_keep on pack files with the .keep file.

But we can't make any assumption about unreachable objects in .keep
packs. If "pack_keep" field is false for .keep packs, we could
potentially pull lots of unreachable objects into the new pack, or
unpack them loose. The safer approach is to ignore all packs with
either a .keep file or --keep-pack.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-pack-objects.txt |  9 ++++-
 Documentation/git-repack.txt       |  9 ++++-
 builtin/pack-objects.c             | 64 ++++++++++++++++++++++++------
 builtin/repack.c                   | 21 ++++++++--
 cache.h                            |  1 +
 t/t7700-repack.sh                  | 25 ++++++++++++
 6 files changed, 111 insertions(+), 18 deletions(-)

diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..403524652a 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -12,7 +12,7 @@ SYNOPSIS
 'git pack-objects' [-q | --progress | --all-progress] [--all-progress-implied]
 	[--no-reuse-delta] [--delta-base-offset] [--non-empty]
 	[--local] [--incremental] [--window=<n>] [--depth=<n>]
-	[--revs [--unpacked | --all]]
+	[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
 	[--stdout [--filter=<filter-spec>] | base-name]
 	[--shallow] [--keep-true-parents] < object-list
 
@@ -126,6 +126,13 @@ base-name::
 	has a .keep file to be ignored, even if it would have
 	otherwise been packed.
 
+--keep-pack=<pack-name>::
+	This flag causes an object already in the given pack to be
+	ignored, even if it would have otherwise been
+	packed. `<pack-name>` is the pack file name without
+	leading directory (e.g. `pack-123.pack`). The option could be
+	specified multiple times to keep multiple packs.
+
 --incremental::
 	This flag causes an object already in a pack to be ignored
 	even if it would have otherwise been packed.
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..ce497d9d12 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -9,7 +9,7 @@ git-repack - Pack unpacked objects in a repository
 SYNOPSIS
 --------
 [verse]
-'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=<n>] [--depth=<n>] [--threads=<n>]
+'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
 
 DESCRIPTION
 -----------
@@ -133,6 +133,13 @@ other objects in that pack they already have locally.
 	with `-b` or `repack.writeBitmaps`, as it ensures that the
 	bitmapped packfile has the necessary objects.
 
+--keep-pack=<pack-name>::
+	Exclude the given pack from repacking. This is the equivalent
+	of having a `.keep` file on the pack. `<pack-name>` is
+	pack file name without leading directory (e.g. `pack-123.pack`).
+	The option could be specified multiple times to keep multiple
+	packs.
+
 --unpack-unreachable=<when>::
 	When loosening unreachable objects, do not bother loosening any
 	objects older than `<when>`. This can be used to optimize out
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index a197926eaa..7ec70131a9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -28,6 +28,7 @@
 #include "argv-array.h"
 #include "list.h"
 #include "packfile.h"
+#include "dir.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -53,7 +54,8 @@ static int pack_loose_unreachable;
 static int local;
 static int have_non_local_packs;
 static int incremental;
-static int ignore_packed_keep;
+static int ignore_packed_keep_on_disk;
+static int ignore_packed_keep_in_core;
 static int allow_ofs_delta;
 static struct pack_idx_option pack_idx_opts;
 static const char *base_name;
@@ -982,13 +984,16 @@ static int want_found_object(int exclude, struct packed_git *p)
 	 * Otherwise, we signal "-1" at the end to tell the caller that we do
 	 * not know either way, and it needs to check more packs.
 	 */
-	if (!ignore_packed_keep &&
+	if (!ignore_packed_keep_on_disk &&
+	    !ignore_packed_keep_in_core &&
 	    (!local || !have_non_local_packs))
 		return 1;
 
 	if (local && !p->pack_local)
 		return 0;
-	if (ignore_packed_keep && p->pack_local && p->pack_keep)
+	if (p->pack_local &&
+	    ((ignore_packed_keep_on_disk && p->pack_keep) ||
+	     (ignore_packed_keep_in_core && p->pack_keep_in_core)))
 		return 0;
 
 	/* we don't know yet; keep looking for more packs */
@@ -2677,7 +2682,7 @@ static void add_objects_in_unpacked_packs(struct rev_info *revs)
 		struct object_id oid;
 		struct object *o;
 
-		if (!p->pack_local || p->pack_keep)
+		if (!p->pack_local || p->pack_keep || p->pack_keep_in_core)
 			continue;
 		if (open_pack_index(p))
 			die("cannot open pack index");
@@ -2739,7 +2744,8 @@ static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid)
 	p = (last_found != (void *)1) ? last_found : packed_git;
 
 	while (p) {
-		if ((!p->pack_local || p->pack_keep) &&
+		if ((!p->pack_local || p->pack_keep ||
+				p->pack_keep_in_core) &&
 			find_pack_entry_one(oid->hash, p)) {
 			last_found = p;
 			return 1;
@@ -2782,7 +2788,7 @@ static void loosen_unused_packed_objects(struct rev_info *revs)
 	struct object_id oid;
 
 	for (p = packed_git; p; p = p->next) {
-		if (!p->pack_local || p->pack_keep)
+		if (!p->pack_local || p->pack_keep || p->pack_keep_in_core)
 			continue;
 
 		if (open_pack_index(p))
@@ -2808,7 +2814,8 @@ static int pack_options_allow_reuse(void)
 {
 	return pack_to_stdout &&
 	       allow_ofs_delta &&
-	       !ignore_packed_keep &&
+	       !ignore_packed_keep_on_disk &&
+	       !ignore_packed_keep_in_core &&
 	       (!local || !have_non_local_packs) &&
 	       !incremental;
 }
@@ -2917,6 +2924,33 @@ static void get_object_list(int ac, const char **av)
 	oid_array_clear(&recent_objects);
 }
 
+static void add_extra_kept_packs(const struct string_list *names)
+{
+	struct packed_git *p;
+
+	if (!names->nr)
+		return;
+
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next) {
+		const char *name = basename(p->pack_name);
+		int i;
+
+		if (!p->pack_local)
+			continue;
+
+		for (i = 0; i < names->nr; i++)
+			if (!fspathcmp(name, names->items[i].string))
+				break;
+
+		if (i < names->nr) {
+			p->pack_keep_in_core = 1;
+			ignore_packed_keep_in_core = 1;
+			continue;
+		}
+	}
+}
+
 static int option_parse_index_version(const struct option *opt,
 				      const char *arg, int unset)
 {
@@ -2956,6 +2990,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	struct argv_array rp = ARGV_ARRAY_INIT;
 	int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0;
 	int rev_list_index = 0;
+	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	struct option pack_objects_options[] = {
 		OPT_SET_INT('q', "quiet", &progress,
 			    N_("do not show progress meter"), 0),
@@ -3020,8 +3055,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("create thin packs")),
 		OPT_BOOL(0, "shallow", &shallow,
 			 N_("create packs suitable for shallow fetches")),
-		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep,
+		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
 			 N_("ignore packs that have companion .keep file")),
+		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
+				N_("ignore this pack")),
 		OPT_INTEGER(0, "compression", &pack_compression_level,
 			    N_("pack compression level")),
 		OPT_SET_INT(0, "keep-true-parents", &grafts_replace_parents,
@@ -3150,19 +3187,20 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		progress = 2;
 
 	prepare_packed_git();
-	if (ignore_packed_keep) {
+	add_extra_kept_packs(&keep_pack_list);
+	if (ignore_packed_keep_on_disk) {
 		struct packed_git *p;
 		for (p = packed_git; p; p = p->next)
 			if (p->pack_local && p->pack_keep)
 				break;
 		if (!p) /* no keep-able packs found */
-			ignore_packed_keep = 0;
+			ignore_packed_keep_on_disk = 0;
 	}
 	if (local) {
 		/*
-		 * unlike ignore_packed_keep above, we do not want to
-		 * unset "local" based on looking at packs, as it
-		 * also covers non-local objects
+		 * unlike ignore_packed_keep_on_disk above, we do not
+		 * want to unset "local" based on looking at packs, as
+		 * it also covers non-local objects
 		 */
 		struct packed_git *p;
 		for (p = packed_git; p; p = p->next) {
diff --git a/builtin/repack.c b/builtin/repack.c
index 7bdb40142f..6c636e159e 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -86,7 +86,8 @@ static void remove_pack_on_signal(int signo)
  * have a corresponding .keep or .promisor file. These packs are not to
  * be kept if we are going to pack everything into one file.
  */
-static void get_non_kept_pack_filenames(struct string_list *fname_list)
+static void get_non_kept_pack_filenames(struct string_list *fname_list,
+					const struct string_list *extra_keep)
 {
 	DIR *dir;
 	struct dirent *e;
@@ -97,6 +98,14 @@ static void get_non_kept_pack_filenames(struct string_list *fname_list)
 
 	while ((e = readdir(dir)) != NULL) {
 		size_t len;
+		int i;
+
+		for (i = 0; i < extra_keep->nr; i++)
+			if (!fspathcmp(e->d_name, extra_keep->items[i].string))
+				break;
+		if (extra_keep->nr > 0 && i < extra_keep->nr)
+			continue;
+
 		if (!strip_suffix(e->d_name, ".pack", &len))
 			continue;
 
@@ -148,7 +157,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	struct string_list rollback = STRING_LIST_INIT_NODUP;
 	struct string_list existing_packs = STRING_LIST_INIT_DUP;
 	struct strbuf line = STRBUF_INIT;
-	int ext, ret, failed;
+	int i, ext, ret, failed;
 	FILE *out;
 
 	/* variables to be filled by option parsing */
@@ -160,6 +169,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	const char *depth = NULL;
 	const char *threads = NULL;
 	const char *max_pack_size = NULL;
+	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	int no_reuse_delta = 0, no_reuse_object = 0;
 	int no_update_server_info = 0;
 	int quiet = 0;
@@ -200,6 +210,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 				N_("maximum size of each packfile")),
 		OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects,
 				N_("repack objects in packs marked with .keep")),
+		OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
+				N_("do not repack this pack")),
 		OPT_END()
 	};
 
@@ -230,6 +242,9 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	argv_array_push(&cmd.args, "--keep-true-parents");
 	if (!pack_kept_objects)
 		argv_array_push(&cmd.args, "--honor-pack-keep");
+	for (i = 0; i < keep_pack_list.nr; i++)
+		argv_array_pushf(&cmd.args, "--keep-pack=%s",
+				 keep_pack_list.items[i].string);
 	argv_array_push(&cmd.args, "--non-empty");
 	argv_array_push(&cmd.args, "--all");
 	argv_array_push(&cmd.args, "--reflog");
@@ -254,7 +269,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 		argv_array_push(&cmd.args, "--write-bitmap-index");
 
 	if (pack_everything & ALL_INTO_ONE) {
-		get_non_kept_pack_filenames(&existing_packs);
+		get_non_kept_pack_filenames(&existing_packs, &keep_pack_list);
 
 		if (existing_packs.nr && delete_redundant) {
 			if (unpack_unreachable) {
diff --git a/cache.h b/cache.h
index d06932ed0b..ba168e3571 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int pack_fd;
 	unsigned pack_local:1,
 		 pack_keep:1,
+		 pack_keep_in_core:1,
 		 freshened:1,
 		 do_not_close:1,
 		 pack_promisor:1;
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index 38247afbec..6162e2a8e6 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -4,6 +4,12 @@ test_description='git repack works correctly'
 
 . ./test-lib.sh
 
+commit_and_pack() {
+	test_commit "$@" >/dev/null &&
+	SHA1=$(git pack-objects --all --unpacked --incremental .git/objects/pack/pack </dev/null) &&
+	echo pack-${SHA1}.pack
+}
+
 test_expect_success 'objects in packs marked .keep are not repacked' '
 	echo content1 > file1 &&
 	echo content2 > file2 &&
@@ -196,5 +202,24 @@ test_expect_success 'objects made unreachable by grafts only are kept' '
 	git cat-file -t $H1
 '
 
+test_expect_success 'repack --keep-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		P1=$(commit_and_pack 1) &&
+		P2=$(commit_and_pack 2) &&
+		P3=$(commit_and_pack 3) &&
+		P4=$(commit_and_pack 4) &&
+		ls .git/objects/pack/*.pack >old-counts &&
+		test_line_count = 4 old-counts &&
+		git repack -a -d --keep-pack $P1 --keep-pack $P4 &&
+		ls .git/objects/pack/*.pack >new-counts &&
+		grep -q $P1 new-counts &&
+		grep -q $P4 new-counts &&
+		test_line_count = 3 new-counts &&
+		git fsck
+	)
+'
+
 test_done
 
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 3/7] gc: add --keep-largest-pack option
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 1/7] t7700: have closing quote of a test at the beginning of line Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 2/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 4/7] gc: add gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
                           ` (3 subsequent siblings)
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This adds a new repack mode that combines everything into a secondary
pack, leaving the largest pack alone.

This could help reduce memory pressure. On linux-2.6.git, valgrind
massif reports 1.6GB heap in "pack all" case, and 535MB in "pack
all except the base pack" case. We save roughly 1GB memory by
excluding the base pack.

This should also lower I/O because we don't have to rewrite a giant
pack every time (e.g. for linux-2.6.git that's a 1.4GB pack file).

PS. The use of string_list here seems overkill, but we'll need it in
the next patch...

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-gc.txt |  6 ++++-
 builtin/gc.c             | 47 ++++++++++++++++++++++++++++++++++++----
 t/t6500-gc.sh            | 25 +++++++++++++++++++++
 3 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 571b5a7e3c..bf81b8de30 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -9,7 +9,7 @@ git-gc - Cleanup unnecessary files and optimize the local repository
 SYNOPSIS
 --------
 [verse]
-'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force]
+'git gc' [--aggressive] [--auto] [--quiet] [--prune=<date> | --no-prune] [--force] [--keep-largest-pack]
 
 DESCRIPTION
 -----------
@@ -78,6 +78,10 @@ automatic consolidation of packs.
 	Force `git gc` to run even if there may be another `git gc`
 	instance running on this repository.
 
+--keep-largest-pack::
+	All packs except the largest pack and those marked with a
+	`.keep` files are consolidated into a single pack.
+
 Configuration
 -------------
 
diff --git a/builtin/gc.c b/builtin/gc.c
index 77fa720bd0..9a09cf53b0 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -164,6 +164,24 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
+static void find_base_packs(struct string_list *packs)
+{
+	struct packed_git *p, *base = NULL;
+
+	prepare_packed_git();
+
+	for (p = packed_git; p; p = p->next) {
+		if (!p->pack_local)
+			continue;
+		if (!base || base->pack_size < p->pack_size) {
+			base = p;
+		}
+	}
+
+	if (base)
+		string_list_append(packs, base->pack_name);
+}
+
 static int too_many_packs(void)
 {
 	struct packed_git *p;
@@ -187,7 +205,13 @@ static int too_many_packs(void)
 	return gc_auto_pack_limit < cnt;
 }
 
-static void add_repack_all_option(void)
+static int keep_one_pack(struct string_list_item *item, void *data)
+{
+	argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string));
+	return 0;
+}
+
+static void add_repack_all_option(struct string_list *keep_pack)
 {
 	if (prune_expire && !strcmp(prune_expire, "now"))
 		argv_array_push(&repack, "-a");
@@ -196,6 +220,9 @@ static void add_repack_all_option(void)
 		if (prune_expire)
 			argv_array_pushf(&repack, "--unpack-unreachable=%s", prune_expire);
 	}
+
+	if (keep_pack)
+		for_each_string_list(keep_pack, keep_one_pack, NULL);
 }
 
 static void add_repack_incremental_option(void)
@@ -219,7 +246,7 @@ static int need_to_gc(void)
 	 * there is no need.
 	 */
 	if (too_many_packs())
-		add_repack_all_option();
+		add_repack_all_option(NULL);
 	else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
@@ -353,6 +380,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 	const char *name;
 	pid_t pid;
 	int daemonized = 0;
+	int keep_base_pack = -1;
 
 	struct option builtin_gc_options[] = {
 		OPT__QUIET(&quiet, N_("suppress progress reporting")),
@@ -362,6 +390,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 		OPT_BOOL(0, "aggressive", &aggressive, N_("be more thorough (increased runtime)")),
 		OPT_BOOL(0, "auto", &auto_gc, N_("enable auto-gc mode")),
 		OPT_BOOL(0, "force", &force, N_("force running gc even if there may be another gc running")),
+		OPT_BOOL(0, "keep-largest-pack", &keep_base_pack,
+			 N_("repack all other packs except the largest pack")),
 		OPT_END()
 	};
 
@@ -427,8 +457,17 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 			 */
 			daemonized = !daemonize();
 		}
-	} else
-		add_repack_all_option();
+	} else {
+		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
+
+		if (keep_base_pack != -1) {
+			if (keep_base_pack)
+				find_base_packs(&keep_pack);
+		}
+
+		add_repack_all_option(&keep_pack);
+		string_list_clear(&keep_pack, 0);
+	}
 
 	name = lock_repo_for_gc(force, &pid);
 	if (name) {
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index 41b0be575d..a95969af2a 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -43,6 +43,31 @@ test_expect_success 'gc is not aborted due to a stale symref' '
 	)
 '
 
+test_expect_success 'gc --keep-largest-pack' '
+	test_create_repo keep-pack &&
+	(
+		cd keep-pack &&
+		test_commit one &&
+		test_commit two &&
+		test_commit three &&
+		git gc &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 1 pack-list &&
+		BASE_PACK=.git/objects/pack/pack-*.pack &&
+		test_commit four &&
+		git repack -d &&
+		test_commit five &&
+		git repack -d &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 3 pack-list &&
+		git gc --keep-largest-pack &&
+		( cd .git/objects/pack && ls *.pack ) >pack-list &&
+		test_line_count = 2 pack-list &&
+		test_path_is_file $BASE_PACK &&
+		git fsck
+	)
+'
+
 test_expect_success 'auto gc with too many loose objects does not attempt to create bitmaps' '
 	test_config gc.auto 3 &&
 	test_config gc.autodetach false &&
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 4/7] gc: add gc.bigPackThreshold config
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                           ` (2 preceding siblings ...)
  2018-03-24  7:25         ` [PATCH v4 3/7] gc: add --keep-largest-pack option Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
                           ` (2 subsequent siblings)
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The --keep-largest-pack option is not very convenient to use because
you need to tell gc to do this explicitly (and probably on just a few
large repos).

Add a config key that enables this mode when packs larger than a limit
are found. Note that there's a slight behavior difference compared to
--keep-largest-pack: all packs larger than the threshold are kept, not
just the largest one.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  8 ++++++++
 Documentation/git-gc.txt |  6 ++++--
 builtin/gc.c             | 26 ++++++++++++++++++++------
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index ce9102cea8..d63db3f12c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1558,6 +1558,14 @@ gc.autoDetach::
 	Make `git gc --auto` return immediately and run in background
 	if the system supports it. Default is true.
 
+gc.bigPackThreshold::
+	If non-zero, all packs larger than this limit are kept when
+	`git gc` is run. This is very similar to `--keep-largest-pack`
+	except that all packs that meet the threshold are kept, not
+	just the base pack. Defaults to zero. Common unit suffixes of
+	'k', 'm', or 'g' are supported.
+
+
 gc.logExpiry::
 	If the file gc.log exists, then `git gc --auto` won't run
 	unless that file is more than 'gc.logExpiry' old.  Default is
diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index bf81b8de30..0adf381b52 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -55,7 +55,8 @@ all loose objects are combined into a single pack using
 disables automatic packing of loose objects.
 +
 If the number of packs exceeds the value of `gc.autoPackLimit`,
-then existing packs (except those marked with a `.keep` file)
+then existing packs (except those marked with a `.keep` file
+or over `gc.bigPackThreshold` limit)
 are consolidated into a single pack by using the `-A` option of
 'git repack'. Setting `gc.autoPackLimit` to 0 disables
 automatic consolidation of packs.
@@ -80,7 +81,8 @@ automatic consolidation of packs.
 
 --keep-largest-pack::
 	All packs except the largest pack and those marked with a
-	`.keep` files are consolidated into a single pack.
+	`.keep` files are consolidated into a single pack. When this
+	option is used, `gc.bigPackThreshold` is ignored.
 
 Configuration
 -------------
diff --git a/builtin/gc.c b/builtin/gc.c
index 9a09cf53b0..53a0500898 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -39,6 +39,7 @@ static timestamp_t gc_log_expire_time;
 static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
+static unsigned long big_pack_threshold;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -126,6 +127,8 @@ static void gc_config(void)
 	git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire);
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
+	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
+
 	git_config(git_default_config, NULL);
 }
 
@@ -164,7 +167,7 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static void find_base_packs(struct string_list *packs)
+static void find_base_packs(struct string_list *packs, unsigned long limit)
 {
 	struct packed_git *p, *base = NULL;
 
@@ -173,7 +176,10 @@ static void find_base_packs(struct string_list *packs)
 	for (p = packed_git; p; p = p->next) {
 		if (!p->pack_local)
 			continue;
-		if (!base || base->pack_size < p->pack_size) {
+		if (limit) {
+			if (p->pack_size >= limit)
+				string_list_append(packs, p->pack_name);
+		} else if (!base || base->pack_size < p->pack_size) {
 			base = p;
 		}
 	}
@@ -245,9 +251,15 @@ static int need_to_gc(void)
 	 * we run "repack -A -d -l".  Otherwise we tell the caller
 	 * there is no need.
 	 */
-	if (too_many_packs())
-		add_repack_all_option(NULL);
-	else if (too_many_loose_objects())
+	if (too_many_packs()) {
+		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
+
+		if (big_pack_threshold)
+			find_base_packs(&keep_pack, big_pack_threshold);
+
+		add_repack_all_option(&keep_pack);
+		string_list_clear(&keep_pack, 0);
+	} else if (too_many_loose_objects())
 		add_repack_incremental_option();
 	else
 		return 0;
@@ -462,7 +474,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix)
 
 		if (keep_base_pack != -1) {
 			if (keep_base_pack)
-				find_base_packs(&keep_pack);
+				find_base_packs(&keep_pack, 0);
+		} else if (big_pack_threshold) {
+			find_base_packs(&keep_pack, big_pack_threshold);
 		}
 
 		add_repack_all_option(&keep_pack);
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 5/7] gc: handle a corner case in gc.bigPackThreshold
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                           ` (3 preceding siblings ...)
  2018-03-24  7:25         ` [PATCH v4 4/7] gc: add gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 6/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 7/7] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This config allows us to keep <N> packs back if their size is larger
than a limit. But if this N >= gc.autoPackLimit, we may have a
problem. We are supposed to reduce the number of packs after a
threshold because it affects performance.

We could tell the user that they have incompatible gc.bigPackThreshold
and gc.autoPackLimit, but it's kinda hard when 'git gc --auto' runs in
background. Instead let's fall back to the next best strategy: try to
reduce the number of packs anyway, but keep the base pack out. This
reduces the number of packs to two and hopefully won't take up too
much resources to repack (the assumption still is the base pack takes
most resources to handle).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt | 6 +++++-
 builtin/gc.c             | 8 +++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index d63db3f12c..cf862d3edf 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1564,7 +1564,11 @@ gc.bigPackThreshold::
 	except that all packs that meet the threshold are kept, not
 	just the base pack. Defaults to zero. Common unit suffixes of
 	'k', 'm', or 'g' are supported.
-
++
+Note that if the number of kept packs is more than gc.autoPackLimit,
+this configuration variable is ignored, all packs except the base pack
+will be repacked. After this the number of packs should go below
+gc.autoPackLimit and gc.bigPackThreshold should be respected again.
 
 gc.logExpiry::
 	If the file gc.log exists, then `git gc --auto` won't run
diff --git a/builtin/gc.c b/builtin/gc.c
index 53a0500898..74d3aaa270 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -254,8 +254,14 @@ static int need_to_gc(void)
 	if (too_many_packs()) {
 		struct string_list keep_pack = STRING_LIST_INIT_NODUP;
 
-		if (big_pack_threshold)
+		if (big_pack_threshold) {
 			find_base_packs(&keep_pack, big_pack_threshold);
+			if (keep_pack.nr >= gc_auto_pack_limit) {
+				big_pack_threshold = 0;
+				string_list_clear(&keep_pack, 0);
+				find_base_packs(&keep_pack, 0);
+			}
+		}
 
 		add_repack_all_option(&keep_pack);
 		string_list_clear(&keep_pack, 0);
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 6/7] gc --auto: exclude base pack if not enough mem to "repack -ad"
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                           ` (4 preceding siblings ...)
  2018-03-24  7:25         ` [PATCH v4 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  2018-03-24  7:25         ` [PATCH v4 7/7] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

pack-objects could be a big memory hog especially on large repos,
everybody knows that. The suggestion to stick a .keep file on the
giant base pack to avoid this problem has also been known for a long time.

Recent patches add an option to do just this, but it has to be either
configured or activated manually. This patch lets `git gc --auto`
activate this mode automatically when it thinks `repack -ad` will use
a lot of memory and start affecting the system due to swapping or
flushing OS cache.

gc --auto decides to do this based on an estimation of pack-objects
memory usage, which is quite accurate at least for the heap part, and
whether that fits in half of system memory (the assumption here is for
desktop environment where there are many other applications running).

This mechanism only kicks in if gc.bigPackThreshold is not configured.
If it is, it is assumed that the user already knows what they want.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/git-gc.txt |  9 +++-
 builtin/gc.c             | 98 +++++++++++++++++++++++++++++++++++++++-
 builtin/pack-objects.c   |  2 +-
 config.mak.uname         |  1 +
 git-compat-util.h        |  4 ++
 pack-objects.h           |  2 +
 t/t6500-gc.sh            |  7 +++
 7 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt
index 0adf381b52..7549094900 100644
--- a/Documentation/git-gc.txt
+++ b/Documentation/git-gc.txt
@@ -58,8 +58,13 @@ If the number of packs exceeds the value of `gc.autoPackLimit`,
 then existing packs (except those marked with a `.keep` file
 or over `gc.bigPackThreshold` limit)
 are consolidated into a single pack by using the `-A` option of
-'git repack'. Setting `gc.autoPackLimit` to 0 disables
-automatic consolidation of packs.
+'git repack'.
+If the amount of memory is estimated not enough for `git repack` to
+run smoothly and `gc.bigPackThreshold` is not set, the largest
+pack will also be excluded (this is the equivalent of running `git gc`
+with `--keep-largest-pack`).
+Setting `gc.autoPackLimit` to 0 disables automatic consolidation of
+packs.
 
 --prune=<date>::
 	Prune loose objects older than date (default is 2 weeks ago,
diff --git a/builtin/gc.c b/builtin/gc.c
index 74d3aaa270..d0b04e369e 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -20,6 +20,10 @@
 #include "argv-array.h"
 #include "commit.h"
 #include "packfile.h"
+#include "pack.h"
+#include "pack-objects.h"
+#include "blob.h"
+#include "tree.h"
 
 #define FAILED_RUN "failed to run %s"
 
@@ -40,6 +44,7 @@ static const char *gc_log_expire = "1.day.ago";
 static const char *prune_expire = "2.weeks.ago";
 static const char *prune_worktrees_expire = "3.months.ago";
 static unsigned long big_pack_threshold;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 
 static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT;
 static struct argv_array reflog = ARGV_ARRAY_INIT;
@@ -128,6 +133,7 @@ static void gc_config(void)
 	git_config_get_expiry("gc.logexpiry", &gc_log_expire);
 
 	git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold);
+	git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size);
 
 	git_config(git_default_config, NULL);
 }
@@ -167,7 +173,8 @@ static int too_many_loose_objects(void)
 	return needed;
 }
 
-static void find_base_packs(struct string_list *packs, unsigned long limit)
+static struct packed_git *find_base_packs(struct string_list *packs,
+					  unsigned long limit)
 {
 	struct packed_git *p, *base = NULL;
 
@@ -186,6 +193,8 @@ static void find_base_packs(struct string_list *packs, unsigned long limit)
 
 	if (base)
 		string_list_append(packs, base->pack_name);
+
+	return base;
 }
 
 static int too_many_packs(void)
@@ -211,6 +220,79 @@ static int too_many_packs(void)
 	return gc_auto_pack_limit < cnt;
 }
 
+static uint64_t total_ram(void)
+{
+#if defined(HAVE_SYSINFO)
+	struct sysinfo si;
+
+	if (!sysinfo(&si))
+		return si.totalram;
+#elif defined(HAVE_BSD_SYSCTL) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM))
+	int64_t physical_memory;
+	int mib[2];
+	size_t length;
+
+	mib[0] = CTL_HW;
+# if defined(HW_MEMSIZE)
+	mib[1] = HW_MEMSIZE;
+# else
+	mib[1] = HW_PHYSMEM;
+# endif
+	length = sizeof(int64_t);
+	if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0))
+		return physical_memory;
+#elif defined(GIT_WINDOWS_NATIVE)
+	MEMORYSTATUSEX memInfo;
+
+	memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+	if (GlobalMemoryStatusEx(&memInfo))
+		return memInfo.ullTotalPhys;
+#endif
+	return 0;
+}
+
+static uint64_t estimate_repack_memory(struct packed_git *pack)
+{
+	unsigned long nr_objects = approximate_object_count();
+	size_t os_cache, heap;
+
+	if (!pack || !nr_objects)
+		return 0;
+
+	/*
+	 * First we have to scan through at least one pack.
+	 * Assume enough room in OS file cache to keep the entire pack
+	 * or we may accidentally evict data of other processes from
+	 * the cache.
+	 */
+	os_cache = pack->pack_size + pack->index_size;
+	/* then pack-objects needs lots more for book keeping */
+	heap = sizeof(struct object_entry) * nr_objects;
+	/*
+	 * internal rev-list --all --objects takes up some memory too,
+	 * let's say half of it is for blobs
+	 */
+	heap += sizeof(struct blob) * nr_objects / 2;
+	/*
+	 * and the other half is for trees (commits and tags are
+	 * usually insignificant)
+	 */
+	heap += sizeof(struct tree) * nr_objects / 2;
+	/* and then obj_hash[], underestimated in fact */
+	heap += sizeof(struct object *) * nr_objects;
+	/* revindex is used also */
+	heap += sizeof(struct revindex_entry) * nr_objects;
+	/*
+	 * read_sha1_file() (either at delta calculation phase, or
+	 * writing phase) also fills up the delta base cache
+	 */
+	heap += delta_base_cache_limit;
+	/* and of course pack-objects has its own delta cache */
+	heap += max_delta_cache_size;
+
+	return os_cache + heap;
+}
+
 static int keep_one_pack(struct string_list_item *item, void *data)
 {
 	argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string));
@@ -261,6 +343,20 @@ static int need_to_gc(void)
 				string_list_clear(&keep_pack, 0);
 				find_base_packs(&keep_pack, 0);
 			}
+		} else {
+			struct packed_git *p = find_base_packs(&keep_pack, 0);
+			uint64_t mem_have, mem_want;
+
+			mem_have = total_ram();
+			mem_want = estimate_repack_memory(p);
+
+			/*
+			 * Only allow 1/2 of memory for pack-objects, leave
+			 * the rest for the OS and other processes in the
+			 * system.
+			 */
+			if (!mem_have || mem_want < mem_have / 2)
+				string_list_clear(&keep_pack, 0);
 		}
 
 		add_repack_all_option(&keep_pack);
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7ec70131a9..8b2f8b72bf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -80,7 +80,7 @@ static uint16_t write_bitmap_options;
 static int exclude_promisor_objects;
 
 static unsigned long delta_cache_size = 0;
-static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
+static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE;
 static unsigned long cache_max_small_delta_size = 1000;
 
 static unsigned long window_memory_limit = 0;
diff --git a/config.mak.uname b/config.mak.uname
index 6a1d0de0cc..ae9cbccec1 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -37,6 +37,7 @@ ifeq ($(uname_S),Linux)
 	HAVE_GETDELIM = YesPlease
 	SANE_TEXT_GREP=-a
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	BASIC_CFLAGS += -DHAVE_SYSINFO
 endif
 ifeq ($(uname_S),GNU/kFreeBSD)
 	HAVE_ALLOCA_H = YesPlease
diff --git a/git-compat-util.h b/git-compat-util.h
index 07e383257b..e373af48b8 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -284,6 +284,10 @@ extern char *gitdirname(char *);
 #include <openssl/err.h>
 #endif
 
+#ifdef HAVE_SYSINFO
+# include <sys/sysinfo.h>
+#endif
+
 /* On most systems <netdb.h> would have given us this, but
  * not on some systems (e.g. z/OS).
  */
diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..af4f46c026 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,8 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024)
+
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh
index a95969af2a..100f287b97 100755
--- a/t/t6500-gc.sh
+++ b/t/t6500-gc.sh
@@ -5,6 +5,13 @@ test_description='basic git gc tests
 
 . ./test-lib.sh
 
+test_expect_success 'setup' '
+	# do not let the amount of physical memory affects gc
+	# behavior, make sure we always pack everything to one pack by
+	# default
+	git config gc.bigPackThreshold 2g
+'
+
 test_expect_success 'gc empty repository' '
 	git gc
 '
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v4 7/7] pack-objects: show some progress when counting kept objects
  2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
                           ` (5 preceding siblings ...)
  2018-03-24  7:25         ` [PATCH v4 6/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
@ 2018-03-24  7:25         ` Nguyễn Thái Ngọc Duy
  6 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-24  7:25 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only show progress when there are new objects to be packed. But
when --keep-pack is specified on the base pack, we will exclude most
of objects. This makes 'pack-objects' stay silent for a long time
while the counting phase is going.

Let's show some progress whenever we visit an object instead. The old
"Counting objects" is renamed to "Enumerating objects" and a new
progress "Counting objects" line is added.

This new "Counting objects" line should progress pretty quick when the
system is beefy. But when the system is under pressure, the reading
object header done in this phase could be slow and showing progress is
an improvement over staying silent in the current code.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 8b2f8b72bf..1379b4cb92 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -44,7 +44,7 @@ static const char *pack_usage[] = {
 static struct packing_data to_pack;
 
 static struct pack_idx_entry **written_list;
-static uint32_t nr_result, nr_written;
+static uint32_t nr_result, nr_written, nr_seen;
 
 static int non_empty;
 static int reuse_delta = 1, reuse_object = 1;
@@ -1096,6 +1096,8 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	off_t found_offset = 0;
 	uint32_t index_pos;
 
+	display_progress(progress_state, ++nr_seen);
+
 	if (have_duplicate_entry(oid, exclude, &index_pos))
 		return 0;
 
@@ -1111,8 +1113,6 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 	create_object_entry(oid, type, pack_name_hash(name),
 			    exclude, name && no_try_delta(name),
 			    index_pos, found_pack, found_offset);
-
-	display_progress(progress_state, nr_result);
 	return 1;
 }
 
@@ -1123,6 +1123,8 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 {
 	uint32_t index_pos;
 
+	display_progress(progress_state, ++nr_seen);
+
 	if (have_duplicate_entry(oid, 0, &index_pos))
 		return 0;
 
@@ -1130,8 +1132,6 @@ static int add_object_entry_from_bitmap(const struct object_id *oid,
 		return 0;
 
 	create_object_entry(oid, type, name_hash, 0, 0, index_pos, pack, offset);
-
-	display_progress(progress_state, nr_result);
 	return 1;
 }
 
@@ -1717,6 +1717,10 @@ static void get_object_details(void)
 	uint32_t i;
 	struct object_entry **sorted_by_offset;
 
+	if (progress)
+		progress_state = start_progress(_("Counting objects"),
+						to_pack.nr_objects);
+
 	sorted_by_offset = xcalloc(to_pack.nr_objects, sizeof(struct object_entry *));
 	for (i = 0; i < to_pack.nr_objects; i++)
 		sorted_by_offset[i] = to_pack.objects + i;
@@ -1727,7 +1731,9 @@ static void get_object_details(void)
 		check_object(entry);
 		if (big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
+		display_progress(progress_state, i + 1);
 	}
+	stop_progress(&progress_state);
 
 	/*
 	 * This must happen in a second pass, since we rely on the delta
@@ -3212,7 +3218,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 
 	if (progress)
-		progress_state = start_progress(_("Counting objects"), 0);
+		progress_state = start_progress(_("Enumerating objects"), 0);
 	if (!use_internal_rev_list)
 		read_object_list_from_stdin();
 	else {
-- 
2.17.0.rc0.348.gd5a49e0b6f


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-24  6:33               ` [PATCH v7 06/13] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-24  9:42                 ` Ævar Arnfjörð Bjarmason
  2018-03-24 12:26                   ` Duy Nguyen
  2018-03-24 12:13                 ` Ævar Arnfjörð Bjarmason
  2018-03-30 20:48                 ` Jeff King
  2 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-24  9:42 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 24 2018, Nguyễn Thái Ngọc Duy wrote:

> +	if (pack->in_pack_by_idx) {
> +		if (p->index <= 0)
> +			die("BUG: found_pack should be NULL "
> +					"instead of having non-positive index");
> +			e->in_pack_idx = p->index;
> +	} else

The indentation after the die() here is wrong. GCC complaining about it:
    
    ./pack-objects.h: In function ‘oe_set_in_pack’:
    ./pack-objects.h:203:3: warning: this ‘if’ clause does not guard... [-Wmisleading-indentation]
       if (p->index <= 0)
       ^~
    ./pack-objects.h:206:4: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’
        e->in_pack_idx = p->index;
        ^

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-24  6:33               ` [PATCH v7 06/13] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
  2018-03-24  9:42                 ` Ævar Arnfjörð Bjarmason
@ 2018-03-24 12:13                 ` Ævar Arnfjörð Bjarmason
  2018-03-30 20:48                 ` Jeff King
  2 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-24 12:13 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 24 2018, Nguyễn Thái Ngọc Duy wrote:

> Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
> pack. Use an index instead since the number of packs should be
> relatively small.
>
> This limits the number of packs we can handle to 1k. Since we can't be
> sure people can never run into the situation where they have more than
> 1k pack files. Provide a fall back route for it.
>
> If we find out they have too many packs, the new in_pack_by_idx[]
> array (which has at most 1k elements) will not be used. Instead we
> allocate in_pack[] array that holds nr_objects elements. This is
> similar to how the optional in_pack_pos field is handled.
>
> The new simple test is just to make sure the too-many-packs code path
> is at least executed. The true test is running
>
>     make test GIT_TEST_FULL_IN_PACK_ARRAY=1

Aside from the tiny nit in 87efk9yfm2.fsf@evledraar.gmail.com this looks
good to me.

I've tested this with the same method noted in
87vadpxv27.fsf@evledraar.gmail.com against the version before it on
similar test data, and got:

 * Reduction in user time by 0.42%
 * Reduction in system time by 3.17%
 * Reduction in RSS by 0.003209%
 * Reduction in page faults by 0% & 0.006539% (time(1) reports two different numbers)
 * Reduction in the I of I/O by 99.504132% (note: from 4840 bytes to 24 bytes, so some fluke...)
 * Reduction in the O of I/O by 0%

I.e. there's no notable change at all, but I thought it would be useful
to re-run this for context.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-24  9:42                 ` Ævar Arnfjörð Bjarmason
@ 2018-03-24 12:26                   ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-24 12:26 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Junio C Hamano, Jeff King

On Sat, Mar 24, 2018 at 10:42 AM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> On Sat, Mar 24 2018, Nguyễn Thái Ngọc Duy wrote:
>
>> +     if (pack->in_pack_by_idx) {
>> +             if (p->index <= 0)
>> +                     die("BUG: found_pack should be NULL "
>> +                                     "instead of having non-positive index");
>> +                     e->in_pack_idx = p->index;
>> +     } else
>
> The indentation after the die() here is wrong. GCC complaining about it:
>
>     ./pack-objects.h: In function ‘oe_set_in_pack’:
>     ./pack-objects.h:203:3: warning: this ‘if’ clause does not guard... [-Wmisleading-indentation]
>        if (p->index <= 0)
>        ^~
>     ./pack-objects.h:206:4: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’
>         e->in_pack_idx = p->index;
>         ^

Thanks. My gcc reported the same thing but only when not used with ccache, hm...
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 00/13] nd/pack-objects-pack-struct updates
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (12 preceding siblings ...)
  2018-03-24  6:33               ` [PATCH v7 13/13] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-26 15:13               ` Jeff King
  2018-03-26 17:04                 ` Duy Nguyen
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
  14 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-26 15:13 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:40AM +0100, Nguyễn Thái Ngọc Duy wrote:

> +unsigned long oe_get_size_slow(struct packing_data *pack,
> +			       const struct object_entry *e)
> +{
> +	struct packed_git *p;
> +	struct pack_window *w_curs;
> +	unsigned char *buf;
> +	enum object_type type;
> +	unsigned long used, avail, size;
> +
> +	if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) {
> +		read_lock();
> +		if (sha1_object_info(e->idx.oid.hash, &size) < 0)
> +			die(_("unable to get size of %s"),
> +			    oid_to_hex(&e->idx.oid));
> +		read_unlock();
> +		return size;
> +	}
> +
> +	p = oe_in_pack(pack, e);
> +	if (!p)
> +		die("BUG: when e->type is a delta, it must belong to a pack");
> +
> +	read_lock();
> +	w_curs = NULL;
> +	buf = use_pack(p, &w_curs, e->in_pack_offset, &avail);
> +	used = unpack_object_header_buffer(buf, avail, &type, &size);
> +	if (used == 0)
> +		die(_("unable to parse object header of %s"),
> +		    oid_to_hex(&e->idx.oid));
> +
> +	unuse_pack(&w_curs);
> +	read_unlock();
> +	return size;
> +}

It took me a while to figure out why this treated deltas and non-deltas
differently. At first I thought it was an optimization (since we can
find non-delta sizes quickly by looking at the headers).  But I think
it's just that you want to know the size of the actual _delta_, not the
reconstructed object. And there's no way to ask sha1_object_info() for
that.

Perhaps the _extended version of that function should learn an
OBJECT_INFO_NO_DEREF flag or something to tell it to return the true delta
type and size. Then this whole function could just become a single call.

But short of that, it's probably worth a comment explaining what's going
on.

> +static void prepare_in_pack_by_idx(struct packing_data *pdata)
> +{
> +	struct packed_git **mapping, *p;
> +	int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
> +
> +	if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
> +		/*
> +		 * leave in_pack_by_idx NULL to force in_pack[] to be
> +		 * used instead
> +		 */
> +		return;
> +	}

Minor nit, but can we use git_env_bool() here? It's just as easy, and
it's less surprising in some corner cases.

>  struct object_entry *packlist_alloc(struct packing_data *pdata,
>  				    const unsigned char *sha1,
>  				    uint32_t index_pos)
>  {
>  	struct object_entry *new_entry;
>  
> +	if (!pdata->nr_objects) {
> +		prepare_in_pack_by_idx(pdata);
> +		if (getenv("GIT_TEST_OE_SIZE_BITS")) {
> +			int bits = atoi(getenv("GIT_TEST_OE_SIZE_BITS"));;
> +			pdata->oe_size_limit = 1 << bits;
> +		}
> +		if (!pdata->oe_size_limit)
> +			pdata->oe_size_limit = 1 << OE_SIZE_BITS;
> +	}

Ditto here; I think this could just be:

  pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE_BITS",
                                       (1 << OE_SIZE_BITS));

>  	if (pdata->nr_objects >= pdata->nr_alloc) {
>  		pdata->nr_alloc = (pdata->nr_alloc  + 1024) * 3 / 2;
>  		REALLOC_ARRAY(pdata->objects, pdata->nr_alloc);
> +
> +		if (!pdata->in_pack_by_idx)
> +			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
>  	}

I was going to complain that we don't use ALLOC_GROW() here, but
actually that part is in the context. ;)

> @@ -35,7 +36,9 @@ enum dfs_state {
>   *
>   * "size" is the uncompressed object size. Compressed size of the raw
>   * data for an object in a pack is not stored anywhere but is computed
> - * and made available when reverse .idx is made.
> + * and made available when reverse .idx is made. Note that when an
> + * delta is reused, "size" is the uncompressed _delta_ size, not the
> + * canonical one after the delta has been applied.

s/an delta/a delta/

> +Running tests with special setups
> +---------------------------------
> +
> +The whole test suite could be run to test some special features
> +that cannot be easily covered by a few specific test cases. These
> +could be enabled by running the test suite with correct GIT_TEST_
> +environment set.
> +
> +GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
> +
> +GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
> +path where there are more than 1024 packs even if the actual number of
> +packs in repository is below this limit.
> +
> +GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
> +code path where we do not cache objecct size in memory and read it
> +from existing packs on demand. This normally only happens when the
> +object size is over 2GB. This variable forces the code path on any
> +object larger than 2^<bits> bytes.

It's nice to have these available to test the uncommon cases. But I have
a feeling nobody will ever run them, since it requires extra effort (and
takes a full test run).

I see there's a one-off test for GIT_TEST_FULL_IN_PACK_ARRAY, which I
think is a good idea, since it makes sure the code is exercised in a
normal test suite run. Should we do the same for GIT_TEST_OE_SIZE_BITS?

Also, s/objecct/object/. :)

> [...]

I haven't done an in-depth read of each patch yet; this was just what
jumped out at me from reading the interdiff.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 00/13] nd/pack-objects-pack-struct updates
  2018-03-26 15:13               ` [PATCH v7 00/13] nd/pack-objects-pack-struct updates Jeff King
@ 2018-03-26 17:04                 ` Duy Nguyen
  2018-03-27 16:53                   ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-26 17:04 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Mon, Mar 26, 2018 at 5:13 PM, Jeff King <peff@peff.net> wrote:
> On Sat, Mar 24, 2018 at 07:33:40AM +0100, Nguyễn Thái Ngọc Duy wrote:
>
>> +unsigned long oe_get_size_slow(struct packing_data *pack,
>> +                            const struct object_entry *e)
>> +{
>> +     struct packed_git *p;
>> +     struct pack_window *w_curs;
>> +     unsigned char *buf;
>> +     enum object_type type;
>> +     unsigned long used, avail, size;
>> +
>> +     if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) {
>> +             read_lock();
>> +             if (sha1_object_info(e->idx.oid.hash, &size) < 0)
>> +                     die(_("unable to get size of %s"),
>> +                         oid_to_hex(&e->idx.oid));
>> +             read_unlock();
>> +             return size;
>> +     }
>> +
>> +     p = oe_in_pack(pack, e);
>> +     if (!p)
>> +             die("BUG: when e->type is a delta, it must belong to a pack");
>> +
>> +     read_lock();
>> +     w_curs = NULL;
>> +     buf = use_pack(p, &w_curs, e->in_pack_offset, &avail);
>> +     used = unpack_object_header_buffer(buf, avail, &type, &size);
>> +     if (used == 0)
>> +             die(_("unable to parse object header of %s"),
>> +                 oid_to_hex(&e->idx.oid));
>> +
>> +     unuse_pack(&w_curs);
>> +     read_unlock();
>> +     return size;
>> +}
>
> It took me a while to figure out why this treated deltas and non-deltas
> differently. At first I thought it was an optimization (since we can
> find non-delta sizes quickly by looking at the headers).  But I think
> it's just that you want to know the size of the actual _delta_, not the
> reconstructed object. And there's no way to ask sha1_object_info() for
> that.
>
> Perhaps the _extended version of that function should learn an
> OBJECT_INFO_NO_DEREF flag or something to tell it return the true delta
> type and size. Then this whole function could just become a single call.
>
> But short of that, it's probably worth a comment explaining what's going
> on.

I thought the elaboration on "size" in the big comment block in front
of struct object_entry was enough. I was wrong. Will add something
here.

>> +Running tests with special setups
>> +---------------------------------
>> +
>> +The whole test suite could be run to test some special features
>> +that cannot be easily covered by a few specific test cases. These
>> +could be enabled by running the test suite with correct GIT_TEST_
>> +environment set.
>> +
>> +GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
>> +
>> +GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
>> +path where there are more than 1024 packs even if the actual number of
>> +packs in repository is below this limit.
>> +
>> +GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
>> +code path where we do not cache objecct size in memory and read it
>> +from existing packs on demand. This normally only happens when the
>> +object size is over 2GB. This variable forces the code path on any
>> +object larger than 2^<bits> bytes.
>
> It's nice to have these available to test the uncommon cases. But I have
> a feeling nobody will ever run them, since it requires extra effort (and
> takes a full test run).

I know :) I also know that this does not interfere with
GIT_TEST_SPLIT_INDEX, which is being run in Travis. So the plan (after
this series is merged) is to make the second Travis run do something
like

make test GIT_TEST_SPLIT...=1 GIT_TEST_FULL..=1 GIT_TEST_OE..=4

we don't waste more cpu cycles and we can make sure these code paths
are always run (at least on one platform)

> I see there's a one-off test for GIT_TEST_FULL_IN_PACK_ARRAY, which I
> think is a good idea, since it makes sure the code is exercised in a
> normal test suite run. Should we do the same for GIT_TEST_OE_SIZE_BITS?

I think the problem with OE_SIZE_BITS is that it has many different code
paths (like reused deltas), and it is hard to make sure they all run. But yes
I think I could construct a pack that executes both code paths in
oe_get_size_slow(). Will do in a reroll.

> I haven't done an in-depth read of each patch yet; this was just what
> jumped out at me from reading the interdiff.

I would really appreciate it if you could find some time to do it. The
bugs I found in this round proved that I had no idea what's really
going on in pack-objects. Sure I know the big picture but that's far
from enough to do changes like this.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v4 06/11] pack-objects: move in_pack out of struct object_entry
  2018-03-16 18:31         ` [PATCH v4 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-26 20:39           ` Stefan Beller
  0 siblings, 0 replies; 273+ messages in thread
From: Stefan Beller @ 2018-03-26 20:39 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong, git,
	Junio C Hamano, Jeff King

Hi,

Sorry for the late review; I was pointed here indirectly via
https://public-inbox.org/git/xmqqy3iebpsw.fsf@gitster-ct.c.googlers.com/

On Fri, Mar 16, 2018 at 11:33 AM Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
wrote:

> +LIMITATIONS
> +-----------
> +
> +This command could only handle 16384 existing pack files at a time.

s/could/can/ ?

> @@ -3191,6 +3200,9 @@ int cmd_pack_objects(int argc, const char **argv,
const char *prefix)
>                  }
>          }

> +       /* make sure IN_PACK(0) return NULL */

I was confused for a while staring at this comment, /s/0/NULL/
would have helped me.

> +static inline unsigned int oe_add_pack(struct packing_data *pack,
> +                                      struct packed_git *p)
> +{
> +       if (pack->in_pack_count >= (1 << OE_IN_PACK_BITS))
> +               die(_("too many packs to handle in one go. "
> +                     "Please add .keep files to exclude\n"
> +                     "some pack files and keep the number "
> +                     "of non-kept files below %d."),
> +                   1 << OE_IN_PACK_BITS);

The packs are indexed 0..N-1, so we can actually handle N
packs I presume. But if we actually have N, then we'd run the

   /* make sure IN_PACK(0) return NULL */
   oe_add_pack(.., NULL);

as N+1, hence the user can only do N-1 ?

Oh wait! the code below makes me think we index from 1..N,
treating index 0 special as uninitialized? So we actually can only
store N-1 ?


> +       if (p) {
> +               if (p->index > 0)

s/>/!=/ ?

The new index variable is only used in these three
inlined header functions, and in_pack_count is strictly
positive, so index as well as in_pack_count could be made
unsigned?

Given that oe_add_pack returns an unsigned, I would actually
prefer to have in_pack_count an unsigned as well.

> +                       die("BUG: this packed is already indexed");
> +               p->index = pack->in_pack_count;
> +       }
> +       pack->in_pack[pack->in_pack_count] = p;
> +       return pack->in_pack_count++;
> +}
> +
> +static inline struct packed_git *oe_in_pack(const struct packing_data
*pack,
> +                                           const struct object_entry *e)
> +{
> +       return pack->in_pack[e->in_pack_idx];
> +
> +}

extra new line after return?

> +static inline void oe_set_in_pack(struct object_entry *e,
> +                                 struct packed_git *p)
> +{
> +       if (p->index <= 0)
> +               die("BUG: found_pack should be NULL "
> +                   "instead of having non-positive index");

Do we also want to guard against
     p->index > (1 << OE_IN_PACK_BITS)
here? Also there is a BUG() macro, that would be better
as it reports line file/number, but we cannot use it here as
it is a header inline.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 00/13] nd/pack-objects-pack-struct updates
  2018-03-26 17:04                 ` Duy Nguyen
@ 2018-03-27 16:53                   ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-27 16:53 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Mon, Mar 26, 2018 at 07:04:54PM +0200, Duy Nguyen wrote:

> >> +unsigned long oe_get_size_slow(struct packing_data *pack,
> >> +                            const struct object_entry *e)
> [...]
> > But short of that, it's probably worth a comment explaining what's going
> > on.
> 
> I thought the elaboration on "size" in the big comment block in front
> of struct object_entry was enough. I was wrong. Will add something
> here.

It may be my fault for reading the interdiff, which didn't include that
comment. I was literally just thinking something like:

  /*
   * Return the size of the object without doing any delta
   * reconstruction (so non-deltas are true object sizes, but
   * deltas return the size of the delta data).
   */

> > I see there's a one-off test for GIT_TEST_FULL_IN_PACK_ARRAY, which I
> > think is a good idea, since it makes sure the code is exercised in a
> > normal test suite run. Should we do the same for GIT_TEST_OE_SIZE_BITS?
> 
> I think the problem with OE_SIZE_BITS is it has many different code
> paths (like reused deltas) which is hard to make sure it runs. But yes
> I think I could construct a pack that executes both code paths in
> oe_get_size_slow(). Will do in a reroll.

OK. If it's too painful to construct a good example, don't worry about
it.  It sounds like we're unlikely to get full coverage anyway.

> > I haven't done an in-depth read of each patch yet; this was just what
> > jumped out at me from reading the interdiff.
> 
> I would really appreciate it if you could find some time to do it. The
> bugs I found in this round proved that I had no idea what's really
> going on in pack-objects. Sure I know the big picture but that's far
> from enough to do changes like this.

I didn't get to it today, but I'll try to give it a careful read. There
are quite a few corners of pack-objects I don't know well, but I think
at this point I may be the most expert of remaining people. Scary. :)

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 02/13] pack-objects: turn type and in_pack_type to bitfields
  2018-03-24  6:33               ` [PATCH v7 02/13] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-30 20:18                 ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:18 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:42AM +0100, Nguyễn Thái Ngọc Duy wrote:

> +static inline void oe_set_type(struct object_entry *e,
> +			       enum object_type type)
> +{
> +	if (type >= OBJ_ANY)
> +		die("BUG: OBJ_ANY cannot be set in pack-objects code");

A minor nit, but this (and other new assertions) should probably be
BUG().

> +	e->type_valid = type >= OBJ_NONE;
> +	e->type_ = (unsigned)type;

Hmm, so if !e->type_valid, then we may write utter garbage into
e->type_. That's OK, since everybody will access it via oe_type(), but I
wonder if we could trigger weird compiler behavior. I guess the unsigned
cast makes the truncation well-defined.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 03/13] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-24  6:33               ` [PATCH v7 03/13] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-30 20:23                 ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:23 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:43AM +0100, Nguyễn Thái Ngọc Duy wrote:

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---

This probably needs some explanation for people digging in history (even
if it's "this is to shrink the size as part of a larger struct-shrinking
effort" so they know to dig around in the nearby history).

> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 647c01ea34..83f8154865 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
>  		OPT_END(),
>  	};
>  
> +	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
> +		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");

I thought this was off-by-one at first, but NUM_STATES is one more than
the highest state, so it's right.

I suspect all of the dfs and depth stuff could be pulled into a separate
array that is used only during that depth search. But as you have it
squished down here, I think we may be getting it "for free" in between
other non-word-aligned values in the struct.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 04/13] pack-objects: use bitfield for object_entry::depth
  2018-03-24  6:33               ` [PATCH v7 04/13] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-30 20:26                 ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:26 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:44AM +0100, Nguyễn Thái Ngọc Duy wrote:

> Because of struct packing from now on we can only handle max depth
> 4095 (or even lower when new booleans are added in this struct). This
> should be ok since long delta chain will cause significant slow down
> anyway.

OK. This is the first user-facing change, but I think it really
shouldn't hurt anybody. My experiments a while ago showed that chains
longer than 50 aren't really worth it, so this could probably shrink
to something like 8 bits if we really needed it to.

> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 83f8154865..205e1f646c 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -3068,6 +3068,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
>  	if (pack_to_stdout != !base_name || argc)
>  		usage_with_options(pack_usage, pack_objects_options);
>  
> +	if (depth >= (1 << OE_DEPTH_BITS))
> +		die(_("delta chain depth %d is greater than maximum limit %d"),
> +		    depth, (1 << OE_DEPTH_BITS) - 1);

Since this is introducing a new limit, I wonder if we should issue a
warning and just clamp it to the maximum value. That would be kinder to
people who may have existing (admittedly dumb) setups.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 05/13] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-24  6:33               ` [PATCH v7 05/13] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-30 20:30                 ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:30 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:45AM +0100, Nguyễn Thái Ngọc Duy wrote:

> This field is only need for pack-bitmap, which is an optional
> feature. Move it to a separate array that is only allocated when
> pack-bitmap is used (it's not freed in the same way that objects[] is
> not).

I had trouble parsing the parenthetical in the last sentence. It does
make sense if you read it hard enough, but maybe:

  (like objects[], it is not freed, since we need it until the end of
  the process)

would be more clear?

The patch itself seems OK.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-24  6:33               ` [PATCH v7 06/13] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
  2018-03-24  9:42                 ` Ævar Arnfjörð Bjarmason
  2018-03-24 12:13                 ` Ævar Arnfjörð Bjarmason
@ 2018-03-30 20:48                 ` Jeff King
  2018-03-31  4:51                   ` Duy Nguyen
  2 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:48 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:46AM +0100, Nguyễn Thái Ngọc Duy wrote:

> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index e1244918a5..b41610569e 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -29,6 +29,8 @@
>  #include "list.h"
>  #include "packfile.h"
>  
> +#define IN_PACK(obj) oe_in_pack(&to_pack, obj)

How come this one gets a macro, but the earlier conversions don't?

I guess the problem is that oe_in_pack() is defined in the generic
pack-objects.h, but &to_pack is only in builtin/pack-objects.c?

I wonder if it would be that bad to just say oe_in_pack(&to_pack, obj)
everywhere. It's longer, but it makes the code slightly less magical to
read.

> @@ -1074,7 +1076,7 @@ static void create_object_entry(const struct object_id *oid,
>  	else
>  		nr_result++;
>  	if (found_pack) {
> -		entry->in_pack = found_pack;
> +		oe_set_in_pack(&to_pack, entry, found_pack);
>  		entry->in_pack_offset = found_offset;
>  	}

it's funny to see in_pack as an external thing, but in_pack_offset still
in the struct. I guess there's nothing to be gained there, since the
offset really does need to be individual (and large).

> diff --git a/cache.h b/cache.h
> index 862bdff83a..b90feb3802 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1635,6 +1635,7 @@ extern struct packed_git {
>  	int index_version;
>  	time_t mtime;
>  	int pack_fd;
> +	int index;		/* for builtin/pack-objects.c */
>  	unsigned pack_local:1,
>  		 pack_keep:1,
>  		 freshened:1,

It's pretty gross to infect this global struct. But I'm not sure there's
an easier way to do it with constant-time lookups. You'd have to build
the packed_git index preemptively in pack-objects, and then always just
pass around the index numbers.  And even that is kind of dicey, since
the packed_git list can grow while we're running.

The alternative is a hash table mapping packed_git pointers into numeric
indices. Yuck.

> +static void prepare_in_pack_by_idx(struct packing_data *pdata)
> +{
> +	struct packed_git **mapping, *p;
> +	int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
> +
> +	if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
> +		/*
> +		 * leave in_pack_by_idx NULL to force in_pack[] to be
> +		 * used instead
> +		 */
> +		return;
> +	}
> +
> +	ALLOC_ARRAY(mapping, nr);
> +	mapping[cnt++] = NULL; /* zero index must be mapped to NULL */

Why? I guess because index==0 is a sentinel for "we're using the small
index numbers?"

> +	prepare_packed_git();
> +	for (p = packed_git; p; p = p->next, cnt++) {
> +		if (cnt == nr) {
> +			free(mapping);
> +			return;
> +		}
> +		p->index = cnt;
> +		mapping[cnt] = p;
> +	}
> +	pdata->in_pack_by_idx = mapping;
> +}

What happens if we later have to reprepare_packed_git() and end up with
more packs? We only call this for the first pack.

It may well be handled, but I'm having trouble following the code to see
if it is. And I doubt that case is covered by our test suite (since it
inherently involves a race).

>  /*
> + * The size of struct nearly determines pack-objects's memory
> + * consumption. This struct is packed tight for that reason. When you
> + * add or reorder something in this struct, think a bit about this.
> + *

It's funny to see this warning come in the middle. Should it be part of
the final struct reordering at the end?

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 07/13] pack-objects: refer to delta objects by index instead of pointer
  2018-03-24  6:33               ` [PATCH v7 07/13] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-30 20:53                 ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:53 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:47AM +0100, Nguyễn Thái Ngọc Duy wrote:

> These delta pointers always point to elements in the objects[] array
> in packing_data struct. We can only hold maximum 4G of those objects
> because the array size in nr_objects is uint32_t. We could use
> uint32_t indexes to address these elements instead of pointers. On
> 64-bit architecture (8 bytes per pointer) this would save 4 bytes per
> pointer.
> 
> Convert these delta pointers to indexes. Since we need to handle NULL
> pointers as well, the index is shifted by one [1].
> 
> [1] This means we can only index 2^32-2 objects even though nr_objects
>     could contain 2^32-1 objects. It should not be a problem in
>     practice because when we grow objects[], nr_alloc would probably
>     blow up long before nr_objects hits the wall.

Hmm, that may be something we eventually fix. I suspect all of this code
does some pretty horrible things as you approach 2^32 objects, though.
I've never tried to make such a pack, but it may be within the realm of
possibility. The .idx file would be 80+GB, but the packfile might not be
much bigger if specially crafted.

I guess that's outside the realm of reasonable, though, so we can assume
that nobody would _really_ want to do that anytime soon. And anything
malicious would probably die long before this code triggers.

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
>  pack-objects.h         |  67 ++++++++++++++++++++++--
>  2 files changed, 124 insertions(+), 59 deletions(-)

The patch itself looks OK. This is one of the nicer ones, because it
really doesn't involve any extra storage management, just some accessor
functions.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-24  6:33               ` [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-30 20:59                 ` Jeff King
  2018-03-31  4:40                   ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-30 20:59 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:48AM +0100, Nguyễn Thái Ngọc Duy wrote:

> We only cache a delta when it's smaller than a certain limit. This limit
> defaults to 1000, but we save the compressed length in a 64-bit field.
> Shrink that field down to 16 bits, so you can only cache 65kb deltas.
> Larger deltas must be recomputed when the pack is written down.

Unlike the depth, I don't think there's any _inherent_ reason you
couldn't throw, say, 1MB deltas into the cache (if you sized it large
enough). But I doubt such deltas are really all that common. Here are
the top 10 in linux.git:

  $ git cat-file --batch-all-objects --batch-check='%(deltabase) %(objectsize:disk)' |
    grep -v ^00000 | sort -k 2nr | head
  a02b6794337286bc12c907c33d5d75537c240bd0 769103
  b28d4b64c05da02c5e8c684dcb9422876225ebdc 327116
  1e98ce86ed19aff9ba721d13a749ff08088c9922 325257
  a02b6794337286bc12c907c33d5d75537c240bd0 240647
  c550d99286c01867dfb26e432417f3106acf8611 177896
  5977795854f852c2b95dd023fd03cace023ee41c 119737
  4ccf9681c45d01d17376f7e0d266532a4460f5f8 112671
  b39fb6821faa9e7bc36de738152a2817b4bf3654 112657
  2645d6239b74bebd661436762e819b831095b084 103980
  b8ce7fe5d8def58dc63b7ae099eff7bd07e4e845 101014

It's possible some weird workload would want to tweak this. Say you were
storing a ton of delta-capable files that were big and always differed
by a megabyte. And it was somehow really important to you to tradeoff
memory for CPU during the write phase of a pack.

That seems pretty unlikely to bite anybody (and that was the best I
could come up with as a devil's advocate against it).

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Documentation/config.txt |  3 ++-
>  builtin/pack-objects.c   | 22 ++++++++++++++++------
>  pack-objects.h           |  3 ++-
>  3 files changed, 20 insertions(+), 8 deletions(-)

Patch looks OK.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size
  2018-03-24  6:33               ` [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
@ 2018-03-30 21:04                 ` Jeff King
  2018-03-31  4:35                   ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-30 21:04 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:50AM +0100, Nguyễn Thái Ngọc Duy wrote:

> While this field most of the time contains the canonical object size,
> there is one case it does not: when we have found that the base object
> of the delta in question is also to be packed, we will very happily
> reuse the delta by copying it over instead of regenerating the new
> delta.
> 
> "size" in this case will record the delta size, not canonical object
> size. Later on in write_reuse_object(), we reconstruct the delta
> header and "size" is used for this purpose. When this happens, the
> "type" field contains a delta type instead of a canonical type.
> Highlight this in the code since it could be tricky to see.

Thanks for digging down here. I have definitely been confused by this in
the past.

The subject says "clarify" so I was a little surprised to see code
changes. It looks like we're just avoiding reassigning on top of the
value repeatedly, which is part of that clarification. It looks like a
noop to me.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 11/13] pack-objects: shrink size field in struct object_entry
  2018-03-24  6:33               ` [PATCH v7 11/13] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-30 21:18                 ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 21:18 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:51AM +0100, Nguyễn Thái Ngọc Duy wrote:

> It's very very rare that an uncompressed object is larger than 4GB
> (partly because Git does not handle those large files very well to
> begin with). Let's optimize it for the common case where object size
> is smaller than this limit.
> 
> Shrink size field down to 32 bits [1] and one overflow bit. If the
> size is too large, we read it back from disk. As noted in the previous
> patch, we need to return the delta size instead of canonical size when
> the to-be-reused object entry type is a delta instead of a canonical
> one.
> 
> Add two compare helpers that can take advantage of the overflow
> bit (e.g. if the file is 4GB+, chances are it's already larger than
> core.bigFileThreshold and there's no point in comparing the actual
> value).
> 
> Another note about oe_get_size_slow(). This function MUST be thread
> safe because SIZE() macro is used inside try_delta() which may run in
> parallel. Outside parallel code, no-contention locking should be dirt
> cheap (or insignificant compared to i/o access anyway). To exercise
> this code, it's best to run the test suite with something like
> 
>     make test GIT_TEST_OE_SIZE_BITS=2
> 
> which forces this code on all objects larger than 3 bytes.

OK, makes sense. Since we need it to be thread-safe, we have to use
read_lock(). Which means that oe_get_size_slow() is defined in
builtin/pack-objects.c. But the object_entry is defined in the
more-generic pack-objects.h.

So anybody besides builtin/pack-objects.c will have to implement their
own fallback when e->size_valid isn't true. Which is a little odd, but I
guess nobody else needs that field. It might bite us in the future, but
I'm willing to cross my fingers for now (the pack-objects.h header is
really just there to support the bitmap writing code, but even that
could in theory all get shoved into a single translation unit if we had
to).

> [1] it's actually already 32 bits on Windows

And linux-i386. :)

> +unsigned long oe_get_size_slow(struct packing_data *pack,
> +			       const struct object_entry *e)
> +{

I think I already replied about this earlier, so I'll skim over it this
time.

> diff --git a/pack-objects.c b/pack-objects.c
> index 13f2b2bff2..59c6e40a02 100644
> --- a/pack-objects.c
> +++ b/pack-objects.c
> @@ -120,8 +120,15 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
>  {
>  	struct object_entry *new_entry;
>  
> -	if (!pdata->nr_objects)
> +	if (!pdata->nr_objects) {
>  		prepare_in_pack_by_idx(pdata);
> +		if (getenv("GIT_TEST_OE_SIZE_BITS")) {
> +			int bits = atoi(getenv("GIT_TEST_OE_SIZE_BITS"));;
> +			pdata->oe_size_limit = 1 << bits;
> +		}
> +		if (!pdata->oe_size_limit)
> +			pdata->oe_size_limit = 1 << OE_SIZE_BITS;
> +	}

This needs to be "1U << OE_SIZE_BITS". Shifting a signed integer 31 bits
is undefined.

No, I'm not that clever or careful myself. I ran the whole test suite
with SANITIZE=address,undefined and it turned this up, as well as a
similar case for OE_DELTA_SIZE_BITS.

> +	uint32_t size_:OE_SIZE_BITS;
> +	uint32_t size_valid:1;

A uint32_t bitfield? Would it make more sense to just call these
"unsigned", since we're specifying the precision already?

> +unsigned long oe_get_size_slow(struct packing_data *pack,
> +			       const struct object_entry *e);
> +static inline unsigned long oe_size(struct packing_data *pack,
> +				    const struct object_entry *e)
> +{
> +	if (e->size_valid)
> +		return e->size_;
> +
> +	return oe_get_size_slow(pack, e);
> +}

If oe_get_size_slow() fails to find an object's size, it dies. I'm
trying to think of whether that might hit funny corner cases with
racing. I don't _think_ so, because if the object truly goes away, we'd
be screwed during the writing phase anyway.

> +static inline int oe_size_less_than(struct packing_data *pack,
> +				    const struct object_entry *lhs,
> +				    unsigned long rhs)
> +{
> +	if (lhs->size_valid)
> +		return lhs->size_ < rhs;
> +	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
> +		return 0;
> +	return oe_get_size_slow(pack, lhs) < rhs;
> +}

Clever.

> +static inline void oe_set_size(struct packing_data *pack,
> +			       struct object_entry *e,
> +			       unsigned long size)
> +{
> +	if (size < pack->oe_size_limit) {
> +		e->size_ = size;
> +		e->size_valid = 1;
> +	} else {
> +		e->size_valid = 0;
> +		if (oe_get_size_slow(pack, e) != size)
> +			die("BUG: 'size' is supposed to be the object size!");
> +	}
> +}

That's an expensive assertion. But I guess this isn't supposed to happen
very frequently, so it's probably OK.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 12/13] pack-objects: shrink delta_size field in struct object_entry
  2018-03-24  6:33               ` [PATCH v7 12/13] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-30 21:24                 ` Jeff King
  2018-03-31  4:21                   ` Duy Nguyen
  2018-03-31  9:10                   ` Duy Nguyen
  0 siblings, 2 replies; 273+ messages in thread
From: Jeff King @ 2018-03-30 21:24 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:52AM +0100, Nguyễn Thái Ngọc Duy wrote:

> Allowing a delta size of 64 bits is crazy. Shrink this field down to
> 31 bits with one overflow bit.
> 
> If we find an existing delta larger than 2GB, we do not cache
> delta_size at all and will get the value from oe_size(), potentially
> from disk if it's larger than 4GB.

Since we have a fallback, we can put this slider wherever we want.
Probably something like 20 bits would be plenty, if we ever needed to
squeeze in a few more small-bit items.

> @@ -2004,10 +2006,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
>  	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
>  	if (!delta_buf)
>  		return 0;
> +	if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
> +		return 0;

This is the other spot that needs to be "1U".

How come this doesn't get a pdata->oe_delta_size_limit like we have
pdata->oe_size_limit? Would we want a matching
$GIT_TEST_OE_DELTA_SIZE_BITS to test it, too?

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 13/13] pack-objects: reorder members to shrink struct object_entry
  2018-03-24  6:33               ` [PATCH v7 13/13] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-30 21:26                 ` Jeff King
  2018-03-31  4:10                   ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-30 21:26 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 24, 2018 at 07:33:53AM +0100, Nguyễn Thái Ngọc Duy wrote:

> Previous patches leave lots of holes and padding in this struct. This
> patch reorders the members and shrinks the struct down to 80 bytes
> (from 136 bytes, before any field shrinking is done) with 16 bits to
> spare (and a couple more in in_pack_header_size when we really run out
> of bits).

Out of curiosity, did you count this yourself, or did you double-check
with a few compilers to make sure they all produce the same result?

So having read the whole thing now, I think most of my original concerns
have been addressed. I do think readability takes a hit, but it's not
_too_ bad. There are a few things that have become more brittle, but I
can't think of anything on the horizon that would bite us.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 13/13] pack-objects: reorder members to shrink struct object_entry
  2018-03-30 21:26                 ` Jeff King
@ 2018-03-31  4:10                   ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31  4:10 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 30, 2018 at 11:26 PM, Jeff King <peff@peff.net> wrote:
> On Sat, Mar 24, 2018 at 07:33:53AM +0100, Nguyễn Thái Ngọc Duy wrote:
>
>> Previous patches leave lots of holes and padding in this struct. This
>> patch reorders the members and shrinks the struct down to 80 bytes
>> (from 136 bytes, before any field shrinking is done) with 16 bits to
>> spare (and a couple more in in_pack_header_size when we really run out
>> of bits).
>
> Out of curiosity, did you count this yourself, or did you double-check
> with a few compilers to make sure they all produce the same result?

I used pahole though only with .o files created by gcc 64-bit. I'll
try the 32-bit version and clang as well.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 12/13] pack-objects: shrink delta_size field in struct object_entry
  2018-03-30 21:24                 ` Jeff King
@ 2018-03-31  4:21                   ` Duy Nguyen
  2018-03-31  9:10                   ` Duy Nguyen
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31  4:21 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 30, 2018 at 11:24 PM, Jeff King <peff@peff.net> wrote:
> On Sat, Mar 24, 2018 at 07:33:52AM +0100, Nguyễn Thái Ngọc Duy wrote:
>> @@ -2004,10 +2006,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
>>       delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
>>       if (!delta_buf)
>>               return 0;
>> +     if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
>> +             return 0;
>
> This is the other spot that needs to be "1U".
>
> How come this doesn't get a pdata->oe_delta_size_limit like we have
> pdata->oe_size_limit? Would we want a matching
> $GIT_TEST_OE_DELTA_SIZE_BITS to test it, too?

Probably. This change does not look as risky as the others (no
complicated fallback). But without $GIT_TEST_OE_DELTA_SIZE_BITS it's
hard to know how the new code reacts when we get over the limit. I
will add it.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size
  2018-03-30 21:04                 ` Jeff King
@ 2018-03-31  4:35                   ` Duy Nguyen
  2018-03-31 10:13                     ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31  4:35 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 30, 2018 at 11:04 PM, Jeff King <peff@peff.net> wrote:
> The subject says "clarify" so I was a little surprised to see code
> changes. It looks like we're just avoiding reassigning on top of the
> value repeatedly, which is part of that clarification. It looks like a
> noop to me.

Oh well... I was counting on the new name (in_pack_size, which follows
the in_pack_type naming convention) to emphasize it (and the new "delta
size" comment to point out where in_pack_size contains a delta size).
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-30 20:59                 ` Jeff King
@ 2018-03-31  4:40                   ` Duy Nguyen
  2018-03-31 10:17                     ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31  4:40 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 30, 2018 at 10:59 PM, Jeff King <peff@peff.net> wrote:
> On Sat, Mar 24, 2018 at 07:33:48AM +0100, Nguyễn Thái Ngọc Duy wrote:
>
>> We only cache a delta when it's smaller than a certain limit. This limit
>> defaults to 1000, but we save the compressed length in a 64-bit field.
>> Shrink that field down to 16 bits, so you can only cache 65kb deltas.
>> Larger deltas must be recomputed when the pack is written down.
>
> Unlike the depth, I don't think there's any _inherent_ reason you
> couldn't throw, say, 1MB deltas into the cache (if you sized it large
> enough). But I doubt such deltas are really all that common. Here are
> the top 10 in linux.git:
>
>   $ git cat-file --batch-all-objects --batch-check='%(deltabase) %(objectsize:disk)' |
>     grep -v ^00000 | sort -k 2nr | head
>   a02b6794337286bc12c907c33d5d75537c240bd0 769103
>   b28d4b64c05da02c5e8c684dcb9422876225ebdc 327116
>   1e98ce86ed19aff9ba721d13a749ff08088c9922 325257
>   a02b6794337286bc12c907c33d5d75537c240bd0 240647
>   c550d99286c01867dfb26e432417f3106acf8611 177896
>   5977795854f852c2b95dd023fd03cace023ee41c 119737
>   4ccf9681c45d01d17376f7e0d266532a4460f5f8 112671
>   b39fb6821faa9e7bc36de738152a2817b4bf3654 112657
>   2645d6239b74bebd661436762e819b831095b084 103980
>   b8ce7fe5d8def58dc63b7ae099eff7bd07e4e845 101014
>
> It's possible some weird workload would want to tweak this. Say you were
> storing a ton of delta-capable files that were big and always differed
> by a megabyte. And it was somehow really important to you to tradeoff
> memory for CPU during the write phase of a pack.

We're not short on spare bits so I will try to raise this limit to 1MB
(not because you mentioned 1MB, but because the largest size in your
output is close to 1MB).
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-30 20:48                 ` Jeff King
@ 2018-03-31  4:51                   ` Duy Nguyen
  2018-03-31 10:20                     ` Jeff King
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31  4:51 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 30, 2018 at 10:48 PM, Jeff King <peff@peff.net> wrote:
> On Sat, Mar 24, 2018 at 07:33:46AM +0100, Nguyễn Thái Ngọc Duy wrote:
>
>> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
>> index e1244918a5..b41610569e 100644
>> --- a/builtin/pack-objects.c
>> +++ b/builtin/pack-objects.c
>> @@ -29,6 +29,8 @@
>>  #include "list.h"
>>  #include "packfile.h"
>>
>> +#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
>
> How come this one gets a macro, but the earlier conversions don't?
>
> I guess the problem is that oe_in_pack() is defined in the generic
> pack-objects.h, but &to_pack is only in builtin/pack-objects.c?
>
> I wonder if it would be that bad to just say oe_in_pack(&to_pack, obj)
> everywhere. It's longer, but it makes the code slightly less magical to
> read.

Longer was exactly why I added these macros (with the hope that the
macro upper case names already ring a "it's magical" bell). Should I
drop all these macros? Some code becomes a lot more verbose though.

>> +static void prepare_in_pack_by_idx(struct packing_data *pdata)
>> +{
>> +     struct packed_git **mapping, *p;
>> +     int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
>> +
>> +     if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
>> +             /*
>> +              * leave in_pack_by_idx NULL to force in_pack[] to be
>> +              * used instead
>> +              */
>> +             return;
>> +     }
>> +
>> +     ALLOC_ARRAY(mapping, nr);
>> +     mapping[cnt++] = NULL; /* zero index must be mapped to NULL */
>
> Why? I guess because index==0 is a sentinel for "we're using the small
> index numbers?"

No, because by default all values in object_entry are zero (or NULL). If
I remember correctly, some code will skip setting the in_pack pointer to
leave it NULL. When we convert it to an index, the zero index should
also map to NULL.

>> +     prepare_packed_git();
>> +     for (p = packed_git; p; p = p->next, cnt++) {
>> +             if (cnt == nr) {
>> +                     free(mapping);
>> +                     return;
>> +             }
>> +             p->index = cnt;
>> +             mapping[cnt] = p;
>> +     }
>> +     pdata->in_pack_by_idx = mapping;
>> +}
>
> What happens if we later have to reprepare_packed_git() and end up with
> more packs? We only call this for the first pack.
>
> It may well be handled, but I'm having trouble following the code to see
> if it is. And I doubt that case is covered by our test suite (since it
> inherently involves a race).

I don't think I covered this case. But since the "index" field in
packed_git should be zero for new packs, we could check for that and
add the new pack to in_pack_by_idx[].

>>  /*
>> + * The size of struct nearly determines pack-objects's memory
>> + * consumption. This struct is packed tight for that reason. When you
>> + * add or reorder something in this struct, think a bit about this.
>> + *
>
> It's funny to see this warning come in the middle. Should it be part of
> the final struct reordering at the end?

It was at the end in some version, then I shuffled the patches and
forgot about this one :)
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 12/13] pack-objects: shrink delta_size field in struct object_entry
  2018-03-30 21:24                 ` Jeff King
  2018-03-31  4:21                   ` Duy Nguyen
@ 2018-03-31  9:10                   ` Duy Nguyen
  1 sibling, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31  9:10 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Fri, Mar 30, 2018 at 11:24 PM, Jeff King <peff@peff.net> wrote:
> How come this doesn't get a pdata->oe_delta_size_limit like we have
> pdata->oe_size_limit? Would we want a matching
> $GIT_TEST_OE_DELTA_SIZE_BITS to test it, too?

Nope. This changes how the delta chain is formed (e.g. produces
shorter chains) and apparently some tests rely on that, like t5303.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* [PATCH v8 00/15] nd/pack-objects-pack-struct updates
  2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
                                 ` (13 preceding siblings ...)
  2018-03-26 15:13               ` [PATCH v7 00/13] nd/pack-objects-pack-struct updates Jeff King
@ 2018-03-31 10:02               ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:02                 ` [PATCH v8 01/15] t/README: mention about running the test suite in special modes Nguyễn Thái Ngọc Duy
                                   ` (16 more replies)
  14 siblings, 17 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:02 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

v8 changes

- prefer BUG() over die()
- do "1U <<" instead of "1 << " to avoid undefined behavior with
  signed shifting.
- add more comments based on Jeff's feedback
- plug a leak in try_delta() when delta_size is too large
- be kind and set depth/cache_max_small_delta_size to max limit
  instead of dying when the user gives a value over limit
- make travis exercise the uncommon pack-objects code paths
- use git_env_*() instead of manually handling getenv() values
- fallback code for when a new pack is added while pack-objects is
  running
- Compressed cached delta size limit is increased from 64k to 1MB
- Cached delta size limit is decreased from 2G to 1MB

Interdiff

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index c774821930..b5bba2c228 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1439,7 +1439,7 @@ static void check_object(struct object_entry *entry)
 			goto give_up;
 
 		if (type < 0)
-			die("BUG: invalid type %d", type);
+			BUG("invalid type %d", type);
 		entry->in_pack_type = type;
 
 		/*
@@ -1861,6 +1861,11 @@ static pthread_mutex_t progress_mutex;
 
 #endif
 
+/*
+ * Return the size of the object without doing any delta
+ * reconstruction (so non-deltas are true object sizes, but deltas
+ * return the size of the delta data).
+ */
 unsigned long oe_get_size_slow(struct packing_data *pack,
 			       const struct object_entry *e)
 {
@@ -1881,7 +1886,7 @@ unsigned long oe_get_size_slow(struct packing_data *pack,
 
 	p = oe_in_pack(pack, e);
 	if (!p)
-		die("BUG: when e->type is a delta, it must belong to a pack");
+		BUG("when e->type is a delta, it must belong to a pack");
 
 	read_lock();
 	w_curs = NULL;
@@ -2006,8 +2011,10 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	if (delta_size >= (1 << OE_DELTA_SIZE_BITS))
+	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
+		free(delta_buf);
 		return 0;
+	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
@@ -2163,7 +2170,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 			unsigned long size;
 
 			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
-			if (size < (1 << OE_Z_DELTA_BITS)) {
+			if (size < (1U << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
 				delta_cache_size -= DELTA_SIZE(entry);
@@ -3131,7 +3138,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	};
 
 	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
-		die("BUG: too many dfs states, increase OE_DFS_STATE_BITS");
+		BUG("too many dfs states, increase OE_DFS_STATE_BITS");
 
 	check_replace_refs = 0;
 
@@ -3149,12 +3156,16 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
-	if (depth >= (1 << OE_DEPTH_BITS))
-		die(_("delta chain depth %d is greater than maximum limit %d"),
-		    depth, (1 << OE_DEPTH_BITS) - 1);
-	if (cache_max_small_delta_size >= (1 << OE_Z_DELTA_BITS))
-		die(_("pack.deltaCacheLimit is greater than maximum limit %d"),
-		    (1 << OE_Z_DELTA_BITS) - 1);
+	if (depth >= (1 << OE_DEPTH_BITS)) {
+		warning(_("delta chain depth %d is too deep, forcing %d"),
+			depth, (1 << OE_DEPTH_BITS) - 1);
+		depth = (1 << OE_DEPTH_BITS) - 1;
+	}
+	if (cache_max_small_delta_size >= (1U << OE_Z_DELTA_BITS)) {
+		warning(_("pack.deltaCacheLimit is too high, forcing %d"),
+			(1U << OE_Z_DELTA_BITS) - 1);
+		cache_max_small_delta_size = (1U << OE_Z_DELTA_BITS) - 1;
+	}
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
@@ -3274,6 +3285,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
+	prepare_packing_data(&to_pack);
+
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/ci/run-tests.sh b/ci/run-tests.sh
index 73e273fac7..857d144ee8 100755
--- a/ci/run-tests.sh
+++ b/ci/run-tests.sh
@@ -10,7 +10,10 @@ ln -s "$cache_dir/.prove" t/.prove
 make --quiet test
 if test "$jobname" = "linux-gcc"
 then
-	GIT_TEST_SPLIT_INDEX=YesPlease make --quiet test
+	export GIT_TEST_SPLIT_INDEX=YesPlease
+	export GIT_TEST_FULL_IN_PACK_ARRAY=true
+	export GIT_TEST_OE_SIZE=10
+	make --quiet test
 fi
 
 check_unignored_build_artifacts
diff --git a/pack-objects.c b/pack-objects.c
index 59c6e40a02..bf2e0a808d 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -3,6 +3,7 @@
 #include "pack.h"
 #include "pack-objects.h"
 #include "packfile.h"
+#include "config.h"
 
 static uint32_t locate_object_entry_hash(struct packing_data *pdata,
 					 const unsigned char *sha1,
@@ -90,18 +91,14 @@ struct object_entry *packlist_find(struct packing_data *pdata,
 static void prepare_in_pack_by_idx(struct packing_data *pdata)
 {
 	struct packed_git **mapping, *p;
-	int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
-
-	if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
-		/*
-		 * leave in_pack_by_idx NULL to force in_pack[] to be
-		 * used instead
-		 */
-		return;
-	}
+	int cnt = 0, nr = 1U << OE_IN_PACK_BITS;
 
 	ALLOC_ARRAY(mapping, nr);
-	mapping[cnt++] = NULL; /* zero index must be mapped to NULL */
+	/*
+	 * oe_in_pack() on an all-zero'd object_entry
+	 * (i.e. in_pack_idx also zero) should return NULL.
+	 */
+	mapping[cnt++] = NULL;
 	prepare_packed_git();
 	for (p = packed_git; p; p = p->next, cnt++) {
 		if (cnt == nr) {
@@ -114,21 +111,50 @@ static void prepare_in_pack_by_idx(struct packing_data *pdata)
 	pdata->in_pack_by_idx = mapping;
 }
 
+/*
+ * A new pack appears after prepare_in_pack_by_idx() has been
+ * run. This is likely a race.
+ *
+ * We could map this new pack to in_pack_by_idx[] array, but then we
+ * have to deal with full array anyway. And since it's hard to test
+ * this fall back code, just stay simple and fall back to using
+ * in_pack[] array.
+ */
+void oe_map_new_pack(struct packing_data *pack,
+		     struct packed_git *p)
+{
+	uint32_t i;
+
+	REALLOC_ARRAY(pack->in_pack, pack->nr_alloc);
+
+	for (i = 0; i < pack->nr_objects; i++)
+		pack->in_pack[i] = oe_in_pack(pack, pack->objects + i);
+
+	FREE_AND_NULL(pack->in_pack_by_idx);
+}
+
+/* assume pdata is already zero'd by caller */
+void prepare_packing_data(struct packing_data *pdata)
+{
+	if (git_env_bool("GIT_TEST_FULL_IN_PACK_ARRAY", 0)) {
+		/*
+		 * do not initialize in_pack_by_idx[] to force the
+		 * slow path in oe_in_pack()
+		 */
+	} else {
+		prepare_in_pack_by_idx(pdata);
+	}
+
+	pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
+					     1U << OE_SIZE_BITS);
+}
+
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos)
 {
 	struct object_entry *new_entry;
 
-	if (!pdata->nr_objects) {
-		prepare_in_pack_by_idx(pdata);
-		if (getenv("GIT_TEST_OE_SIZE_BITS")) {
-			int bits = atoi(getenv("GIT_TEST_OE_SIZE_BITS"));;
-			pdata->oe_size_limit = 1 << bits;
-		}
-		if (!pdata->oe_size_limit)
-			pdata->oe_size_limit = 1 << OE_SIZE_BITS;
-	}
 	if (pdata->nr_objects >= pdata->nr_alloc) {
 		pdata->nr_alloc = (pdata->nr_alloc  + 1024) * 3 / 2;
 		REALLOC_ARRAY(pdata->objects, pdata->nr_alloc);
diff --git a/pack-objects.h b/pack-objects.h
index c20f67e25b..60192cce1f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,9 +4,13 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		10
-#define OE_Z_DELTA_BITS		16
+#define OE_Z_DELTA_BITS		20
+/*
+ * Note that oe_set_size() becomes expensive when the given size is
+ * above this limit. Don't lower it too much.
+ */
 #define OE_SIZE_BITS		31
-#define OE_DELTA_SIZE_BITS	31
+#define OE_DELTA_SIZE_BITS	20
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -36,7 +40,7 @@ enum dfs_state {
  *
  * "size" is the uncompressed object size. Compressed size of the raw
  * data for an object in a pack is not stored anywhere but is computed
- * and made available when reverse .idx is made. Note that when an
+ * and made available when reverse .idx is made. Note that when a
  * delta is reused, "size" is the uncompressed _delta_ size, not the
  * canonical one after the delta has been applied.
  *
@@ -77,15 +81,15 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	off_t in_pack_offset;
 	uint32_t hash;			/* name hint hash */
-	uint32_t size_:OE_SIZE_BITS;
+	unsigned size_:OE_SIZE_BITS;
 	unsigned size_valid:1;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	uint32_t delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
-	uint32_t delta_size_valid:1;
+	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	unsigned delta_size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_valid:1;
@@ -103,7 +107,15 @@ struct object_entry {
 	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
 
-	/* size: 80, bit_padding: 20 bits, holes: 1 bit */
+	/*
+	 * pahole results on 64-bit linux (gcc and clang)
+	 *
+	 *   size: 80, bit_padding: 20 bits, holes: 8 bits
+	 *
+	 * and on 32-bit (gcc)
+	 *
+	 *   size: 76, bit_padding: 20 bits, holes: 8 bits
+	 */
 };
 
 struct packing_data {
@@ -127,6 +139,7 @@ struct packing_data {
 	uintmax_t oe_size_limit;
 };
 
+void prepare_packing_data(struct packing_data *pdata);
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos);
@@ -164,7 +177,7 @@ static inline void oe_set_type(struct object_entry *e,
 			       enum object_type type)
 {
 	if (type >= OBJ_ANY)
-		die("BUG: OBJ_ANY cannot be set in pack-objects code");
+		BUG("OBJ_ANY cannot be set in pack-objects code");
 
 	e->type_valid = type >= OBJ_NONE;
 	e->type_ = (unsigned)type;
@@ -190,21 +203,20 @@ static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
 		return pack->in_pack_by_idx[e->in_pack_idx];
 	else
 		return pack->in_pack[e - pack->objects];
-
 }
 
+void oe_map_new_pack(struct packing_data *pack,
+		     struct packed_git *p);
 static inline void oe_set_in_pack(struct packing_data *pack,
 				  struct object_entry *e,
 				  struct packed_git *p)
 {
-	if (pack->in_pack_by_idx) {
-		if (p->index <= 0)
-			die("BUG: found_pack should be NULL "
-					"instead of having non-positive index");
+	if (!p->index)
+		oe_map_new_pack(pack, p);
+	if (pack->in_pack_by_idx)
 		e->in_pack_idx = p->index;
-	} else
+	else
 		pack->in_pack[e - pack->objects] = p;
-
 }
 
 static inline struct object_entry *oe_delta(
@@ -307,7 +319,7 @@ static inline void oe_set_size(struct packing_data *pack,
 	} else {
 		e->size_valid = 0;
 		if (oe_get_size_slow(pack, e) != size)
-			die("BUG: 'size' is supposed to be the object size!");
+			BUG("'size' is supposed to be the object size!");
 	}
 }
 
@@ -326,7 +338,7 @@ static inline void oe_set_delta_size(struct packing_data *pack,
 	e->delta_size_ = size;
 	e->delta_size_valid = e->delta_size_ == size;
 	if (!e->delta_size_valid && size != oe_size(pack, e))
-		die("BUG: this can only happen in check_object() "
+		BUG("this can only happen in check_object() "
 		    "where delta size is the same as entry size");
 }
 
diff --git a/t/README b/t/README
index 02bfb3fed5..c01d210c15 100644
--- a/t/README
+++ b/t/README
@@ -291,16 +291,26 @@ expect the rest to function correctly.
 and know what setup is needed for it.  Or when you want to run
 everything up to a certain test.
 
+
+Running tests with special setups
+---------------------------------
+
+The whole test suite could be run to test some special features
+that cannot be easily covered by a few specific test cases. These
+could be enabled by running the test suite with correct GIT_TEST_
+environment set.
+
+GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
+
 GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
 path where there are more than 1024 packs even if the actual number of
 packs in repository is below this limit.
 
-GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
-code path where we do not cache objecct size in memory and read it
-from existing packs on demand. This normally only happens when the
-object size is over 2GB. This variable forces the code path on any
-object larger than 2^<bits> bytes.
-
+GIT_TEST_OE_SIZE=<n> exercises the uncommon pack-objects code path
+where we do not cache object size in memory and read it from existing
+packs on demand. This normally only happens when the object size is
+over 2GB. This variable forces the code path on any object larger than
+<n> bytes.
 
 Naming Tests
 ------------
Nguyễn Thái Ngọc Duy (15):
  t/README: mention about running the test suite in special modes
  pack-objects: a bit of document about struct object_entry
  pack-objects: turn type and in_pack_type to bitfields
  pack-objects: use bitfield for object_entry::dfs_state
  pack-objects: use bitfield for object_entry::depth
  pack-objects: move in_pack_pos out of struct object_entry
  pack-objects: move in_pack out of struct object_entry
  pack-objects: refer to delta objects by index instead of pointer
  pack-objects: shrink z_delta_size field in struct object_entry
  pack-objects: don't check size when the object is bad
  pack-objects: clarify the use of object_entry::size
  pack-objects: shrink size field in struct object_entry
  pack-objects: shrink delta_size field in struct object_entry
  pack-objects: reorder members to shrink struct object_entry
  ci: exercise the whole test suite with uncommon code in pack-objects

 Documentation/config.txt           |   4 +-
 Documentation/git-pack-objects.txt |   4 +-
 Documentation/git-repack.txt       |   4 +-
 builtin/pack-objects.c             | 366 +++++++++++++++++++----------
 cache.h                            |   3 +
 ci/run-tests.sh                    |   5 +-
 object.h                           |   1 -
 pack-bitmap-write.c                |  14 +-
 pack-bitmap.c                      |   2 +-
 pack-bitmap.h                      |   4 +-
 pack-objects.c                     |  69 ++++++
 pack-objects.h                     | 312 ++++++++++++++++++++++--
 t/README                           |  20 ++
 t/t5300-pack-object.sh             |   5 +
 14 files changed, 650 insertions(+), 163 deletions(-)

-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 01/15] t/README: mention about running the test suite in special modes
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:02                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:02                 ` [PATCH v8 02/15] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
                                   ` (15 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:02 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

From: Duy Nguyen <pclouds@gmail.com>

There are features that would benefit from running the whole test
suite instead of just a few test cases written specifically for
them. Split-index mode is one of them. Document it.

This step is required because a few patches later, we will
introduce more test modes like this to test some corner cases of
pack-objects as much as possible.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 t/README | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/t/README b/t/README
index 1a1361a806..09eb2b9768 100644
--- a/t/README
+++ b/t/README
@@ -292,6 +292,16 @@ and know what setup is needed for it.  Or when you want to run
 everything up to a certain test.
 
 
+Running tests with special setups
+---------------------------------
+
+The whole test suite could be run to test some special features
+that cannot be easily covered by a few specific test cases. These
+could be enabled by running the test suite with correct GIT_TEST_
+environment set.
+
+GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
+
 Naming Tests
 ------------
 
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 02/15] pack-objects: a bit of document about struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
  2018-03-31 10:02                 ` [PATCH v8 01/15] t/README: mention about running the test suite in special modes Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:02                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:02                 ` [PATCH v8 03/15] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
                                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:02 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

The role of this comment block becomes more important after we shuffle
fields around to shrink this struct. It will be much harder to see what
field is related to what.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/pack-objects.h b/pack-objects.h
index 03f1191659..c0a1f61aac 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,51 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+/*
+ * basic object info
+ * -----------------
+ * idx.oid is filled up before delta searching starts. idx.crc32 is
+ * only valid after the object is written out and will be used for
+ * generating the index. idx.offset will be both gradually set and
+ * used in writing phase (base objects get offset first, then deltas
+ * refer to them)
+ *
+ * "size" is the uncompressed object size. Compressed size of the raw
+ * data for an object in a pack is not stored anywhere but is computed
+ * and made available when reverse .idx is made.
+ *
+ * "hash" contains a path name hash which is used for sorting the
+ * delta list and also during delta searching. Once prepare_pack()
+ * returns it's no longer needed.
+ *
+ * source pack info
+ * ----------------
+ * The (in_pack, in_pack_offset) tuple contains the location of the
+ * object in the source pack. in_pack_header_size allows quickly
+ * skipping the header and going straight to the zlib stream.
+ *
+ * "type" and "in_pack_type" both describe object type. in_pack_type
+ * may contain a delta type, while type is always the canonical type.
+ *
+ * deltas
+ * ------
+ * Delta links (delta, delta_child and delta_sibling) are created to
+ * reflect that delta graph from the source pack then updated or added
+ * during delta searching phase when we find better deltas.
+ *
+ * delta_child and delta_sibling are last needed in
+ * compute_write_order(). "delta" and "delta_size" must remain valid
+ * at object writing phase in case the delta is not cached.
+ *
+ * If a delta is cached in memory and is compressed, delta_data points
+ * to the data and z_delta_size contains the compressed size. If it's
+ * uncompressed [1], z_delta_size must be zero. delta_size is always
+ * the uncompressed size and must be valid even if the delta is not
+ * cached.
+ *
+ * [1] during try_delta phase we don't bother with compressing because
+ * the delta could be quickly replaced with a better one.
+ */
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 03/15] pack-objects: turn type and in_pack_type to bitfields
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
  2018-03-31 10:02                 ` [PATCH v8 01/15] t/README: mention about running the test suite in special modes Nguyễn Thái Ngọc Duy
  2018-03-31 10:02                 ` [PATCH v8 02/15] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:02                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 04/15] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
                                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:02 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

An extra field type_valid is added to carry the equivalent of OBJ_BAD
in the original "type" field. in_pack_type always contains a valid
type so we only need 3 bits for it.

A note about accepting OBJ_NONE as a "valid" type. The function
read_object_list_from_stdin() can pass this value [1] and it
eventually calls create_object_entry() where the current code skips
setting the "type" field if the incoming type is zero. This does not
have any bad side effects because the "type" field should be
memset()'d anyway.

But since we also need to set type_valid now, skipping oe_set_type()
leaves type_valid zero/false, which will make oe_type() return
OBJ_BAD, not OBJ_NONE anymore. Apparently we do care about OBJ_NONE in
prepare_pack(). This switch from OBJ_NONE to OBJ_BAD may trigger

    fatal: unable to get type of object ...

Accepting OBJ_NONE [2] does sound wrong, but this is how it has
been for a very long time and I haven't had time to dig in further.

[1] See 5c49c11686 (pack-objects: better check_object() performances -
    2007-04-16)

[2] 21666f1aae (convert object type handling from a string to a number
    - 2007-02-26)

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 60 ++++++++++++++++++++++++------------------
 cache.h                |  2 ++
 object.h               |  1 -
 pack-bitmap-write.c    |  6 ++---
 pack-objects.h         | 20 ++++++++++++--
 5 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5c674b2843..7133baa63f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -265,7 +265,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		if (entry->type == OBJ_BLOB &&
+		if (oe_type(entry) == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
@@ -371,7 +371,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
-	enum object_type type = entry->type;
+	enum object_type type = oe_type(entry);
 	off_t datalen;
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
@@ -480,11 +480,12 @@ static off_t write_object(struct hashfile *f,
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
-	else if (entry->type == OBJ_REF_DELTA || entry->type == OBJ_OFS_DELTA)
+	else if (oe_type(entry) == OBJ_REF_DELTA ||
+		 oe_type(entry) == OBJ_OFS_DELTA)
 				/* check_object() decided it for us ... */
 		to_reuse = usable_delta;
 				/* ... but pack split may override that */
-	else if (entry->type != entry->in_pack_type)
+	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
 	else if (entry->delta)
 		to_reuse = 0;	/* we want to pack afresh */
@@ -705,8 +706,8 @@ static struct object_entry **compute_write_order(void)
 	 * And then all remaining commits and tags.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_COMMIT &&
-		    objects[i].type != OBJ_TAG)
+		if (oe_type(&objects[i]) != OBJ_COMMIT &&
+		    oe_type(&objects[i]) != OBJ_TAG)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -715,7 +716,7 @@ static struct object_entry **compute_write_order(void)
 	 * And then all the trees.
 	 */
 	for (i = last_untagged; i < to_pack.nr_objects; i++) {
-		if (objects[i].type != OBJ_TREE)
+		if (oe_type(&objects[i]) != OBJ_TREE)
 			continue;
 		add_to_write_order(wo, &wo_end, &objects[i]);
 	}
@@ -1066,8 +1067,7 @@ static void create_object_entry(const struct object_id *oid,
 
 	entry = packlist_alloc(&to_pack, oid->hash, index_pos);
 	entry->hash = hash;
-	if (type)
-		entry->type = type;
+	oe_set_type(entry, type);
 	if (exclude)
 		entry->preferred_base = 1;
 	else
@@ -1407,6 +1407,7 @@ static void check_object(struct object_entry *entry)
 		unsigned long avail;
 		off_t ofs;
 		unsigned char *buf, c;
+		enum object_type type;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1415,11 +1416,15 @@ static void check_object(struct object_entry *entry)
 		 * since non-delta representations could still be reused.
 		 */
 		used = unpack_object_header_buffer(buf, avail,
-						   &entry->in_pack_type,
+						   &type,
 						   &entry->size);
 		if (used == 0)
 			goto give_up;
 
+		if (type < 0)
+			BUG("invalid type %d", type);
+		entry->in_pack_type = type;
+
 		/*
 		 * Determine if this is a delta and if so whether we can
 		 * reuse it or not.  Otherwise let's find out as cheaply as
@@ -1428,9 +1433,9 @@ static void check_object(struct object_entry *entry)
 		switch (entry->in_pack_type) {
 		default:
 			/* Not a delta hence we've already got all we need. */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->in_pack_header_size = used;
-			if (entry->type < OBJ_COMMIT || entry->type > OBJ_BLOB)
+			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
 			unuse_pack(&w_curs);
 			return;
@@ -1484,7 +1489,7 @@ static void check_object(struct object_entry *entry)
 			 * deltify other objects against, in order to avoid
 			 * circular deltas.
 			 */
-			entry->type = entry->in_pack_type;
+			oe_set_type(entry, entry->in_pack_type);
 			entry->delta = base_entry;
 			entry->delta_size = entry->size;
 			entry->delta_sibling = base_entry->delta_child;
@@ -1493,7 +1498,7 @@ static void check_object(struct object_entry *entry)
 			return;
 		}
 
-		if (entry->type) {
+		if (oe_type(entry)) {
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
@@ -1516,7 +1521,7 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	entry->type = sha1_object_info(entry->idx.oid.hash, &entry->size);
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
 	/*
 	 * The error condition is checked in prepare_pack().  This is
 	 * to permit a missing preferred base object to be ignored
@@ -1559,6 +1564,7 @@ static void drop_reused_delta(struct object_entry *entry)
 {
 	struct object_entry **p = &entry->delta->delta_child;
 	struct object_info oi = OBJECT_INFO_INIT;
+	enum object_type type;
 
 	while (*p) {
 		if (*p == entry)
@@ -1570,16 +1576,18 @@ static void drop_reused_delta(struct object_entry *entry)
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
-	oi.typep = &entry->type;
+	oi.typep = &type;
 	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
-		 * And if that fails, the error will be recorded in entry->type
+		 * And if that fails, the error will be recorded in oe_type(entry)
 		 * and dealt with in prepare_pack().
 		 */
-		entry->type = sha1_object_info(entry->idx.oid.hash,
-					       &entry->size);
+		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+						    &entry->size));
+	} else {
+		oe_set_type(entry, type);
 	}
 }
 
@@ -1747,10 +1755,12 @@ static int type_size_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	enum object_type a_type = oe_type(a);
+	enum object_type b_type = oe_type(b);
 
-	if (a->type > b->type)
+	if (a_type > b_type)
 		return -1;
-	if (a->type < b->type)
+	if (a_type < b_type)
 		return 1;
 	if (a->hash > b->hash)
 		return -1;
@@ -1826,7 +1836,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (trg_entry->type != src_entry->type)
+	if (oe_type(trg_entry) != oe_type(src_entry))
 		return -1;
 
 	/*
@@ -2432,11 +2442,11 @@ static void prepare_pack(int window, int depth)
 
 		if (!entry->preferred_base) {
 			nr_deltas++;
-			if (entry->type < 0)
+			if (oe_type(entry) < 0)
 				die("unable to get type of object %s",
 				    oid_to_hex(&entry->idx.oid));
 		} else {
-			if (entry->type < 0) {
+			if (oe_type(entry) < 0) {
 				/*
 				 * This object is not found, but we
 				 * don't have to include it anyway.
@@ -2545,7 +2555,7 @@ static void read_object_list_from_stdin(void)
 			die("expected object ID, got garbage:\n %s", line);
 
 		add_preferred_base_object(p + 1);
-		add_object_entry(&oid, 0, p + 1, 0);
+		add_object_entry(&oid, OBJ_NONE, p + 1, 0);
 	}
 }
 
diff --git a/cache.h b/cache.h
index 21fbcc2414..862bdff83a 100644
--- a/cache.h
+++ b/cache.h
@@ -373,6 +373,8 @@ extern void free_name_hash(struct index_state *istate);
 #define read_blob_data_from_cache(path, sz) read_blob_data_from_index(&the_index, (path), (sz))
 #endif
 
+#define TYPE_BITS 3
+
 enum object_type {
 	OBJ_BAD = -1,
 	OBJ_NONE = 0,
diff --git a/object.h b/object.h
index 87563d9056..8ce294d6ec 100644
--- a/object.h
+++ b/object.h
@@ -25,7 +25,6 @@ struct object_array {
 
 #define OBJECT_ARRAY_INIT { 0, 0, NULL }
 
-#define TYPE_BITS   3
 /*
  * object flag allocation:
  * revision.h:      0---------10                                26
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index e01f992884..fd11f08940 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -64,12 +64,12 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 
 		entry->in_pack_pos = i;
 
-		switch (entry->type) {
+		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
 		case OBJ_TREE:
 		case OBJ_BLOB:
 		case OBJ_TAG:
-			real_type = entry->type;
+			real_type = oe_type(entry);
 			break;
 
 		default:
@@ -98,7 +98,7 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 		default:
 			die("Missing type information for %s (%d/%d)",
 			    oid_to_hex(&entry->idx.oid), real_type,
-			    entry->type);
+			    oe_type(entry));
 		}
 	}
 }
diff --git a/pack-objects.h b/pack-objects.h
index c0a1f61aac..b4a83a6123 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -59,8 +59,9 @@ struct object_entry {
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
-	enum object_type type;
-	enum object_type in_pack_type;	/* could be delta */
+	unsigned type_:TYPE_BITS;
+	unsigned in_pack_type:TYPE_BITS; /* could be delta */
+	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
 	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
@@ -123,4 +124,19 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline enum object_type oe_type(const struct object_entry *e)
+{
+	return e->type_valid ? e->type_ : OBJ_BAD;
+}
+
+static inline void oe_set_type(struct object_entry *e,
+			       enum object_type type)
+{
+	if (type >= OBJ_ANY)
+		BUG("OBJ_ANY cannot be set in pack-objects code");
+
+	e->type_valid = type >= OBJ_NONE;
+	e->type_ = (unsigned)type;
+}
+
 #endif
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 04/15] pack-objects: use bitfield for object_entry::dfs_state
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (2 preceding siblings ...)
  2018-03-31 10:02                 ` [PATCH v8 03/15] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 05/15] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
                                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 +++
 pack-objects.h         | 28 +++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7133baa63f..ebb6e034cb 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3049,6 +3049,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_END(),
 	};
 
+	if (DFS_NUM_STATES > (1 << OE_DFS_STATE_BITS))
+		BUG("too many dfs states, increase OE_DFS_STATE_BITS");
+
 	check_replace_refs = 0;
 
 	reset_pack_idx_option(&pack_idx_opts);
diff --git a/pack-objects.h b/pack-objects.h
index b4a83a6123..080ef62d31 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -1,6 +1,21 @@
 #ifndef PACK_OBJECTS_H
 #define PACK_OBJECTS_H
 
+#define OE_DFS_STATE_BITS	2
+
+/*
+ * State flags for depth-first search used for analyzing delta cycles.
+ *
+ * The depth is measured in delta-links to the base (so if A is a delta
+ * against B, then A has a depth of 1, and B a depth of 0).
+ */
+enum dfs_state {
+	DFS_NONE = 0,
+	DFS_ACTIVE,
+	DFS_DONE,
+	DFS_NUM_STATES
+};
+
 /*
  * basic object info
  * -----------------
@@ -73,19 +88,10 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+	unsigned dfs_state:OE_DFS_STATE_BITS;
 
-	/*
-	 * State flags for depth-first search used for analyzing delta cycles.
-	 *
-	 * The depth is measured in delta-links to the base (so if A is a delta
-	 * against B, then A has a depth of 1, and B a depth of 0).
-	 */
-	enum {
-		DFS_NONE = 0,
-		DFS_ACTIVE,
-		DFS_DONE
-	} dfs_state;
 	int depth;
+
 };
 
 struct packing_data {
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 05/15] pack-objects: use bitfield for object_entry::depth
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (3 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 04/15] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 06/15] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
                                   ` (11 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Because of struct packing, from now on we can only handle a max depth
of 4095 (or even lower when new booleans are added to this struct).
This should be ok since long delta chains will cause significant
slowdown anyway.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt           | 1 +
 Documentation/git-pack-objects.txt | 4 +++-
 Documentation/git-repack.txt       | 4 +++-
 builtin/pack-objects.c             | 6 ++++++
 pack-objects.h                     | 5 ++---
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index f57e9cf10c..9bd3f5a789 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2412,6 +2412,7 @@ pack.window::
 pack.depth::
 	The maximum delta depth used by linkgit:git-pack-objects[1] when no
 	maximum depth is given on the command line. Defaults to 50.
+	Maximum value is 4095.
 
 pack.windowMemory::
 	The maximum size of memory that is consumed by each thread
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt
index 81bc490ac5..3503c9e3e6 100644
--- a/Documentation/git-pack-objects.txt
+++ b/Documentation/git-pack-objects.txt
@@ -96,7 +96,9 @@ base-name::
 	it too deep affects the performance on the unpacker
 	side, because delta data needs to be applied that many
 	times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --window-memory=<n>::
 	This option provides an additional limit on top of `--window`;
diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index ae750e9e11..25c83c4927 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -90,7 +90,9 @@ other objects in that pack they already have locally.
 	space. `--depth` limits the maximum delta depth; making it too deep
 	affects the performance on the unpacker side, because delta data needs
 	to be applied that many times to get to the necessary object.
-	The default value for --window is 10 and --depth is 50.
++
+The default value for --window is 10 and --depth is 50. The maximum
+depth is 4095.
 
 --threads=<n>::
 	This option is passed through to `git pack-objects`.
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ebb6e034cb..2ce05626d2 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3068,6 +3068,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (depth >= (1 << OE_DEPTH_BITS)) {
+		warning(_("delta chain depth %d is too deep, forcing %d"),
+			depth, (1 << OE_DEPTH_BITS) - 1);
+		depth = (1 << OE_DEPTH_BITS) - 1;
+	}
+
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
 		use_internal_rev_list = 1;
diff --git a/pack-objects.h b/pack-objects.h
index 080ef62d31..cdce1648de 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -2,6 +2,7 @@
 #define PACK_OBJECTS_H
 
 #define OE_DFS_STATE_BITS	2
+#define OE_DEPTH_BITS		12
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -89,9 +90,7 @@ struct object_entry {
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
-
-	int depth;
-
+	unsigned depth:OE_DEPTH_BITS;
 };
 
 struct packing_data {
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 06/15] pack-objects: move in_pack_pos out of struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (4 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 05/15] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 07/15] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
                                   ` (10 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

This field is only needed for pack-bitmap, which is an optional
feature. Move it to a separate array that is only allocated when
pack-bitmap is used (like objects[], it is not freed, since we need it
until the end of the process).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c |  3 ++-
 pack-bitmap-write.c    |  8 +++++---
 pack-bitmap.c          |  2 +-
 pack-bitmap.h          |  4 +++-
 pack-objects.h         | 16 +++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 2ce05626d2..7a672366bf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -879,7 +879,8 @@ static void write_pack_file(void)
 
 			if (write_bitmap_index) {
 				bitmap_writer_set_checksum(oid.hash);
-				bitmap_writer_build_type_index(written_list, nr_written);
+				bitmap_writer_build_type_index(
+					&to_pack, written_list, nr_written);
 			}
 
 			finish_tmp_packfile(&tmpname, pack_tmp_name,
diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index fd11f08940..f7c897515b 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -48,7 +48,8 @@ void bitmap_writer_show_progress(int show)
 /**
  * Build the initial type index for the packfile
  */
-void bitmap_writer_build_type_index(struct pack_idx_entry **index,
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
 				    uint32_t index_nr)
 {
 	uint32_t i;
@@ -57,12 +58,13 @@ void bitmap_writer_build_type_index(struct pack_idx_entry **index,
 	writer.trees = ewah_new();
 	writer.blobs = ewah_new();
 	writer.tags = ewah_new();
+	ALLOC_ARRAY(to_pack->in_pack_pos, to_pack->nr_objects);
 
 	for (i = 0; i < index_nr; ++i) {
 		struct object_entry *entry = (struct object_entry *)index[i];
 		enum object_type real_type;
 
-		entry->in_pack_pos = i;
+		oe_set_in_pack_pos(to_pack, entry, i);
 
 		switch (oe_type(entry)) {
 		case OBJ_COMMIT:
@@ -147,7 +149,7 @@ static uint32_t find_object_pos(const unsigned char *sha1)
 			"(object %s is missing)", sha1_to_hex(sha1));
 	}
 
-	return entry->in_pack_pos;
+	return oe_in_pack_pos(writer.to_pack, entry);
 }
 
 static void show_object(struct object *object, const char *name, void *data)
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 9270983e5f..865d9ecc4e 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1032,7 +1032,7 @@ int rebuild_existing_bitmaps(struct packing_data *mapping,
 		oe = packlist_find(mapping, sha1, NULL);
 
 		if (oe)
-			reposition[i] = oe->in_pack_pos + 1;
+			reposition[i] = oe_in_pack_pos(mapping, oe) + 1;
 	}
 
 	rebuild = bitmap_new();
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3742a00e14..5ded2f139a 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -44,7 +44,9 @@ int rebuild_existing_bitmaps(struct packing_data *mapping, khash_sha1 *reused_bi
 
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
-void bitmap_writer_build_type_index(struct pack_idx_entry **index, uint32_t index_nr);
+void bitmap_writer_build_type_index(struct packing_data *to_pack,
+				    struct pack_idx_entry **index,
+				    uint32_t index_nr);
 void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack);
 void bitmap_writer_select_commits(struct commit **indexed_commits,
 		unsigned int indexed_commits_nr, int max_bitmaps);
diff --git a/pack-objects.h b/pack-objects.h
index cdce1648de..71ea992c3c 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -79,7 +79,6 @@ struct object_entry {
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
 	uint32_t hash;			/* name hint hash */
-	unsigned int in_pack_pos;
 	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
@@ -99,6 +98,8 @@ struct packing_data {
 
 	int32_t *index;
 	uint32_t index_size;
+
+	unsigned int *in_pack_pos;
 };
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
@@ -144,4 +145,17 @@ static inline void oe_set_type(struct object_entry *e,
 	e->type_ = (unsigned)type;
 }
 
+static inline unsigned int oe_in_pack_pos(const struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	return pack->in_pack_pos[e - pack->objects];
+}
+
+static inline void oe_set_in_pack_pos(const struct packing_data *pack,
+				      const struct object_entry *e,
+				      unsigned int pos)
+{
+	pack->in_pack_pos[e - pack->objects] = pos;
+}
+
 #endif
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 07/15] pack-objects: move in_pack out of struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (5 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 06/15] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 08/15] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
                                   ` (9 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Instead of using 8 bytes (on 64 bit arch) to store a pointer to a
pack, use an index, since the number of packs should be relatively
small.

This limits the number of packs we can handle to 1k. Since we can't be
sure people will never run into the situation where they have more than
1k pack files, provide a fallback route for it.

If we find out they have too many packs, the new in_pack_by_idx[]
array (which has at most 1k elements) will not be used. Instead we
allocate an in_pack[] array that holds nr_objects elements. This is
similar to how the optional in_pack_pos field is handled.

The new simple test is just to make sure the too-many-packs code path
is at least executed. The true test is running

    make test GIT_TEST_FULL_IN_PACK_ARRAY=1

to take advantage of other special case tests.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 28 +++++++++++-------
 cache.h                |  1 +
 pack-objects.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
 pack-objects.h         | 36 ++++++++++++++++++++++-
 t/README               |  4 +++
 t/t5300-pack-object.sh |  5 ++++
 6 files changed, 128 insertions(+), 12 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7a672366bf..4e5812a053 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,6 +29,8 @@
 #include "list.h"
 #include "packfile.h"
 
+#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -367,7 +369,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = entry->in_pack;
+	struct packed_git *p = IN_PACK(entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -478,7 +480,7 @@ static off_t write_object(struct hashfile *f,
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!entry->in_pack)
+	else if (!IN_PACK(entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (oe_type(entry) == OBJ_REF_DELTA ||
 		 oe_type(entry) == OBJ_OFS_DELTA)
@@ -1074,7 +1076,7 @@ static void create_object_entry(const struct object_id *oid,
 	else
 		nr_result++;
 	if (found_pack) {
-		entry->in_pack = found_pack;
+		oe_set_in_pack(&to_pack, entry, found_pack);
 		entry->in_pack_offset = found_offset;
 	}
 
@@ -1399,8 +1401,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
-	if (entry->in_pack) {
-		struct packed_git *p = entry->in_pack;
+	if (IN_PACK(entry)) {
+		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1535,14 +1537,16 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
+	const struct packed_git *a_in_pack = IN_PACK(a);
+	const struct packed_git *b_in_pack = IN_PACK(b);
 
 	/* avoid filesystem trashing with loose objects */
-	if (!a->in_pack && !b->in_pack)
+	if (!a_in_pack && !b_in_pack)
 		return oidcmp(&a->idx.oid, &b->idx.oid);
 
-	if (a->in_pack < b->in_pack)
+	if (a_in_pack < b_in_pack)
 		return -1;
-	if (a->in_pack > b->in_pack)
+	if (a_in_pack > b_in_pack)
 		return 1;
 	return a->in_pack_offset < b->in_pack_offset ? -1 :
 			(a->in_pack_offset > b->in_pack_offset);
@@ -1578,7 +1582,7 @@ static void drop_reused_delta(struct object_entry *entry)
 
 	oi.sizep = &entry->size;
 	oi.typep = &type;
-	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1848,8 +1852,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && trg_entry->in_pack &&
-	    trg_entry->in_pack == src_entry->in_pack &&
+	if (reuse_delta && IN_PACK(trg_entry) &&
+	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -3193,6 +3197,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 	}
 
+	prepare_packing_data(&to_pack);
+
 	if (progress)
 		progress_state = start_progress(_("Counting objects"), 0);
 	if (!use_internal_rev_list)
diff --git a/cache.h b/cache.h
index 862bdff83a..b90feb3802 100644
--- a/cache.h
+++ b/cache.h
@@ -1635,6 +1635,7 @@ extern struct packed_git {
 	int index_version;
 	time_t mtime;
 	int pack_fd;
+	int index;		/* for builtin/pack-objects.c */
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
diff --git a/pack-objects.c b/pack-objects.c
index 9558d13834..09f0a88865 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -2,6 +2,8 @@
 #include "object.h"
 #include "pack.h"
 #include "pack-objects.h"
+#include "packfile.h"
+#include "config.h"
 
 static uint32_t locate_object_entry_hash(struct packing_data *pdata,
 					 const unsigned char *sha1,
@@ -86,6 +88,64 @@ struct object_entry *packlist_find(struct packing_data *pdata,
 	return &pdata->objects[pdata->index[i] - 1];
 }
 
+static void prepare_in_pack_by_idx(struct packing_data *pdata)
+{
+	struct packed_git **mapping, *p;
+	int cnt = 0, nr = 1U << OE_IN_PACK_BITS;
+
+	ALLOC_ARRAY(mapping, nr);
+	/*
+	 * oe_in_pack() on an all-zero'd object_entry
+	 * (i.e. in_pack_idx also zero) should return NULL.
+	 */
+	mapping[cnt++] = NULL;
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next, cnt++) {
+		if (cnt == nr) {
+			free(mapping);
+			return;
+		}
+		p->index = cnt;
+		mapping[cnt] = p;
+	}
+	pdata->in_pack_by_idx = mapping;
+}
+
+/*
+ * A new pack appears after prepare_in_pack_by_idx() has been
+ * run. This is likely a race.
+ *
+ * We could map this new pack to in_pack_by_idx[] array, but then we
+ * have to deal with full array anyway. And since it's hard to test
+ * this fall back code, just stay simple and fall back to using
+ * in_pack[] array.
+ */
+void oe_map_new_pack(struct packing_data *pack,
+		     struct packed_git *p)
+{
+	uint32_t i;
+
+	REALLOC_ARRAY(pack->in_pack, pack->nr_alloc);
+
+	for (i = 0; i < pack->nr_objects; i++)
+		pack->in_pack[i] = oe_in_pack(pack, pack->objects + i);
+
+	FREE_AND_NULL(pack->in_pack_by_idx);
+}
+
+/* assume pdata is already zero'd by caller */
+void prepare_packing_data(struct packing_data *pdata)
+{
+	if (git_env_bool("GIT_TEST_FULL_IN_PACK_ARRAY", 0)) {
+		/*
+		 * do not initialize in_pack_by_idx[] to force the
+		 * slow path in oe_in_pack()
+		 */
+	} else {
+		prepare_in_pack_by_idx(pdata);
+	}
+}
+
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos)
@@ -95,6 +155,9 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 	if (pdata->nr_objects >= pdata->nr_alloc) {
 		pdata->nr_alloc = (pdata->nr_alloc  + 1024) * 3 / 2;
 		REALLOC_ARRAY(pdata->objects, pdata->nr_alloc);
+
+		if (!pdata->in_pack_by_idx)
+			REALLOC_ARRAY(pdata->in_pack, pdata->nr_alloc);
 	}
 
 	new_entry = pdata->objects + pdata->nr_objects++;
@@ -107,5 +170,8 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 	else
 		pdata->index[index_pos] = pdata->nr_objects;
 
+	if (pdata->in_pack)
+		pdata->in_pack[pdata->nr_objects - 1] = NULL;
+
 	return new_entry;
 }
diff --git a/pack-objects.h b/pack-objects.h
index 71ea992c3c..4a8aa56042 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -3,6 +3,7 @@
 
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
+#define OE_IN_PACK_BITS		10
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -65,7 +66,7 @@ enum dfs_state {
 struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;	/* uncompressed size */
-	struct packed_git *in_pack;	/* already in pack */
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	struct object_entry *delta;	/* delta base object */
 	struct object_entry *delta_child; /* deltified objects who bases me */
@@ -100,8 +101,18 @@ struct packing_data {
 	uint32_t index_size;
 
 	unsigned int *in_pack_pos;
+
+	/*
+	 * Only one of these can be non-NULL and they have different
+	 * sizes. if in_pack_by_idx is allocated, oe_in_pack() returns
+	 * the pack of an object using in_pack_idx field. If not,
+	 * in_pack[] array is used the same way as in_pack_pos[]
+	 */
+	struct packed_git **in_pack_by_idx;
+	struct packed_git **in_pack;
 };
 
+void prepare_packing_data(struct packing_data *pdata);
 struct object_entry *packlist_alloc(struct packing_data *pdata,
 				    const unsigned char *sha1,
 				    uint32_t index_pos);
@@ -158,4 +169,27 @@ static inline void oe_set_in_pack_pos(const struct packing_data *pack,
 	pack->in_pack_pos[e - pack->objects] = pos;
 }
 
+static inline struct packed_git *oe_in_pack(const struct packing_data *pack,
+					    const struct object_entry *e)
+{
+	if (pack->in_pack_by_idx)
+		return pack->in_pack_by_idx[e->in_pack_idx];
+	else
+		return pack->in_pack[e - pack->objects];
+}
+
+void oe_map_new_pack(struct packing_data *pack,
+		     struct packed_git *p);
+static inline void oe_set_in_pack(struct packing_data *pack,
+				  struct object_entry *e,
+				  struct packed_git *p)
+{
+	if (!p->index)
+		oe_map_new_pack(pack, p);
+	if (pack->in_pack_by_idx)
+		e->in_pack_idx = p->index;
+	else
+		pack->in_pack[e - pack->objects] = p;
+}
+
 #endif
diff --git a/t/README b/t/README
index 09eb2b9768..c6130ff16d 100644
--- a/t/README
+++ b/t/README
@@ -302,6 +302,10 @@ environment set.
 
 GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
 
+GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
+path where there are more than 1024 packs even if the actual number of
+packs in repository is below this limit.
+
 Naming Tests
 ------------
 
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index 9c68b99251..5c076637ff 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -457,6 +457,11 @@ test_expect_success !PTHREADS,C_LOCALE_OUTPUT 'pack-objects --threads=N or pack.
 	grep -F "no threads support, ignoring pack.threads" err
 '
 
+test_expect_success 'pack-objects in too-many-packs mode' '
+	GIT_TEST_FULL_IN_PACK_ARRAY=1 git repack -ad &&
+	git fsck
+'
+
 #
 # WARNING!
 #
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 08/15] pack-objects: refer to delta objects by index instead of pointer
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (6 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 07/15] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 09/15] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
                                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

These delta pointers always point to elements in the objects[] array
in the packing_data struct. We can hold at most 4G of those objects
because the array size in nr_objects is uint32_t. We could use
uint32_t indexes to address these elements instead of pointers. On a
64-bit architecture (8 bytes per pointer) this would save 4 bytes per
pointer.

Convert these delta pointers to indexes. Since we need to handle NULL
pointers as well, the index is shifted by one [1].

[1] This means we can only index 2^32-2 objects even though nr_objects
    could contain 2^32-1 objects. It should not be a problem in
    practice because when we grow objects[], nr_alloc would probably
    blow up long before nr_objects hits the wall.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 116 ++++++++++++++++++++++-------------------
 pack-objects.h         |  67 ++++++++++++++++++++++--
 2 files changed, 124 insertions(+), 59 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4e5812a053..118c8fd993 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,12 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define DELTA(obj) oe_delta(&to_pack, obj)
+#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
+#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
+#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
+#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -127,11 +133,11 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(entry->delta->idx.oid.hash, &type,
+	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&entry->delta->idx.oid));
+		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
 	if (!delta_buf || delta_size != entry->delta_size)
@@ -288,12 +294,12 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		size = entry->delta_size;
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
 		size = entry->delta_size;
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -317,7 +323,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -343,7 +349,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,8 +385,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
 
-	if (entry->delta)
-		type = (allow_ofs_delta && entry->delta->idx.offset) ?
+	if (DELTA(entry))
+		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry->size);
@@ -408,7 +414,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - entry->delta->idx.offset;
+		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -427,7 +433,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, entry->delta->idx.oid.hash, 20);
+		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -467,13 +473,13 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!entry->delta)
+	if (!DELTA(entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (entry->delta->idx.offset == (off_t)-1)
+	else if (DELTA(entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (entry->delta->idx.offset)
+	else if (DELTA(entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
@@ -489,7 +495,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (entry->delta)
+	else if (DELTA(entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -541,12 +547,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (e->delta) {
+	if (DELTA(e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, e->delta, offset)) {
+		switch (write_one(f, DELTA(e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			e->delta = NULL;
+			SET_DELTA(e, NULL);
 			break;
 		default:
 			break;
@@ -608,34 +614,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = e->delta_sibling; s; s = s->delta_sibling) {
+			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (e->delta_child) {
+		if (DELTA_CHILD(e)) {
 			add_to_order = 1;
-			e = e->delta_child;
+			e = DELTA_CHILD(e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (e->delta_sibling) {
-				e = e->delta_sibling;
+			if (DELTA_SIBLING(e)) {
+				e = DELTA_SIBLING(e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = e->delta;
-			while (e && !e->delta_sibling) {
+			e = DELTA(e);
+			while (e && !DELTA_SIBLING(e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = e->delta;
+				e = DELTA(e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = e->delta_sibling;
+			e = DELTA_SIBLING(e);
 		}
 	};
 }
@@ -646,7 +652,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; root->delta; root = root->delta)
+	for (root = e; DELTA(root); root = DELTA(root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -661,8 +667,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		objects[i].delta_child = NULL;
-		objects[i].delta_sibling = NULL;
+		SET_DELTA_CHILD(&objects[i], NULL);
+		SET_DELTA_SIBLING(&objects[i], NULL);
 	}
 
 	/*
@@ -672,11 +678,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!e->delta)
+		if (!DELTA(e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling = e->delta->delta_child;
-		e->delta->delta_child = e;
+		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
+		SET_DELTA_CHILD(DELTA(e), e);
 	}
 
 	/*
@@ -1493,10 +1499,10 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->delta = base_entry;
+			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
-			entry->delta_sibling = base_entry->delta_child;
-			base_entry->delta_child = entry;
+			entry->delta_sibling_idx = base_entry->delta_child_idx;
+			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1567,17 +1573,19 @@ static int pack_offset_sort(const void *_a, const void *_b)
  */
 static void drop_reused_delta(struct object_entry *entry)
 {
-	struct object_entry **p = &entry->delta->delta_child;
+	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
 
-	while (*p) {
-		if (*p == entry)
-			*p = (*p)->delta_sibling;
+	while (*idx) {
+		struct object_entry *oe = &to_pack.objects[*idx - 1];
+
+		if (oe == entry)
+			*idx = oe->delta_sibling_idx;
 		else
-			p = &(*p)->delta_sibling;
+			idx = &oe->delta_sibling_idx;
 	}
-	entry->delta = NULL;
+	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &entry->size;
@@ -1617,7 +1625,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = cur->delta, total_depth++) {
+	     cur = DELTA(cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1642,7 +1650,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!cur->delta) {
+		if (!DELTA(cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1665,7 +1673,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (cur->delta->dfs_state == DFS_ACTIVE) {
+		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1680,7 +1688,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = cur->delta;
+		next = DELTA(cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1865,7 +1873,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
-	if (!trg_entry->delta) {
+	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
@@ -1941,7 +1949,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	if (!delta_buf)
 		return 0;
 
-	if (trg_entry->delta) {
+	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
 		if (delta_size == trg_entry->delta_size &&
 		    src->depth + 1 >= trg->depth) {
@@ -1970,7 +1978,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	trg_entry->delta = src_entry;
+	SET_DELTA(trg_entry, src_entry);
 	trg_entry->delta_size = delta_size;
 	trg->depth = src->depth + 1;
 
@@ -1979,13 +1987,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = me->delta_child;
+	struct object_entry *child = DELTA_CHILD(me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = child->delta_sibling;
+		child = DELTA_SIBLING(child);
 	}
 	return m;
 }
@@ -2054,7 +2062,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (entry->delta_child) {
+		if (DELTA_CHILD(entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2104,7 +2112,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (entry->delta && max_depth <= n->depth)
+		if (DELTA(entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2112,7 +2120,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (entry->delta) {
+		if (DELTA(entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2433,7 +2441,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (entry->delta)
+		if (DELTA(entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
diff --git a/pack-objects.h b/pack-objects.h
index 4a8aa56042..272ddeeedb 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -68,11 +68,11 @@ struct object_entry {
 	unsigned long size;	/* uncompressed size */
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
-	struct object_entry *delta;	/* delta base object */
-	struct object_entry *delta_child; /* deltified objects who bases me */
-	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+	uint32_t delta_idx;	/* delta base object */
+	uint32_t delta_child_idx; /* deltified objects who bases me */
+	uint32_t delta_sibling_idx; /* other deltified objects who
+				     * uses the same base as me
+				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	unsigned long z_delta_size;	/* delta data size (compressed) */
@@ -192,4 +192,61 @@ static inline void oe_set_in_pack(struct packing_data *pack,
 		pack->in_pack[e - pack->objects] = p;
 }
 
+static inline struct object_entry *oe_delta(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_idx)
+		return &pack->objects[e->delta_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta(struct packing_data *pack,
+				struct object_entry *e,
+				struct object_entry *delta)
+{
+	if (delta)
+		e->delta_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_child(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_child_idx)
+		return &pack->objects[e->delta_child_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_child(struct packing_data *pack,
+				      struct object_entry *e,
+				      struct object_entry *delta)
+{
+	if (delta)
+		e->delta_child_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_child_idx = 0;
+}
+
+static inline struct object_entry *oe_delta_sibling(
+		const struct packing_data *pack,
+		const struct object_entry *e)
+{
+	if (e->delta_sibling_idx)
+		return &pack->objects[e->delta_sibling_idx - 1];
+	return NULL;
+}
+
+static inline void oe_set_delta_sibling(struct packing_data *pack,
+					struct object_entry *e,
+					struct object_entry *delta)
+{
+	if (delta)
+		e->delta_sibling_idx = (delta - pack->objects) + 1;
+	else
+		e->delta_sibling_idx = 0;
+}
+
 #endif
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 09/15] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (7 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 08/15] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 10/15] pack-objects: don't check size when the object is bad Nguyễn Thái Ngọc Duy
                                   ` (7 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

We only cache deltas when they are smaller than a certain limit. This
limit defaults to 1000, but we save the compressed length in a 64-bit
field. Shrink that field down to 20 bits, so you can only cache 1MB
deltas. Larger deltas must be recomputed when the pack is written down.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Documentation/config.txt |  3 ++-
 builtin/pack-objects.c   | 24 ++++++++++++++++++------
 pack-objects.h           |  3 ++-
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 9bd3f5a789..00fa824448 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -2449,7 +2449,8 @@ pack.deltaCacheLimit::
 	The maximum size of a delta, that is cached in
 	linkgit:git-pack-objects[1]. This cache is used to speed up the
 	writing object phase by not having to recompute the final delta
-	result once the best match for all objects is found. Defaults to 1000.
+	result once the best match for all objects is found.
+	Defaults to 1000. Maximum value is 1048575.
 
 pack.threads::
 	Specifies the number of threads to spawn when searching for best
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 118c8fd993..34e285a1b7 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2100,12 +2100,19 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * between writes at that moment.
 		 */
 		if (entry->delta_data && !pack_to_stdout) {
-			entry->z_delta_size = do_compress(&entry->delta_data,
-							  entry->delta_size);
-			cache_lock();
-			delta_cache_size -= entry->delta_size;
-			delta_cache_size += entry->z_delta_size;
-			cache_unlock();
+			unsigned long size;
+
+			size = do_compress(&entry->delta_data, entry->delta_size);
+			if (size < (1U << OE_Z_DELTA_BITS)) {
+				entry->z_delta_size = size;
+				cache_lock();
+				delta_cache_size -= entry->delta_size;
+				delta_cache_size += entry->z_delta_size;
+				cache_unlock();
+			} else {
+				FREE_AND_NULL(entry->delta_data);
+				entry->z_delta_size = 0;
+			}
 		}
 
 		/* if we made n a delta, and if n is already at max
@@ -3086,6 +3093,11 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			depth, (1 << OE_DEPTH_BITS) - 1);
 		depth = (1 << OE_DEPTH_BITS) - 1;
 	}
+	if (cache_max_small_delta_size >= (1U << OE_Z_DELTA_BITS)) {
+		warning(_("pack.deltaCacheLimit is too high, forcing %d"),
+			(1U << OE_Z_DELTA_BITS) - 1);
+		cache_max_small_delta_size = (1U << OE_Z_DELTA_BITS) - 1;
+	}
 
 	argv_array_push(&rp, "pack-objects");
 	if (thin) {
diff --git a/pack-objects.h b/pack-objects.h
index 272ddeeedb..5613a3d040 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -4,6 +4,7 @@
 #define OE_DFS_STATE_BITS	2
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		10
+#define OE_Z_DELTA_BITS		20
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -75,7 +76,7 @@ struct object_entry {
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
-	unsigned long z_delta_size;	/* delta data size (compressed) */
+	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
 	unsigned type_valid:1;
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 10/15] pack-objects: don't check size when the object is bad
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (8 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 09/15] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 11/15] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
                                   ` (6 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

sha1_object_info() in check_objects() may fail to locate an object in
the pack and return type OBJ_BAD. In that case, it will likely leave
the "size" field untouched. We delay error handling until later in
prepare_pack() though. Until then, do not touch "size" field.

This field should contain the default value zero, but we can't say
sha1_object_info() cannot damage it. This becomes more important later
when the object size may have to be retrieved back from the
(non-existing) pack.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 34e285a1b7..481b55c746 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1741,7 +1741,7 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (big_file_threshold < entry->size)
+		if (entry->type_valid && big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
 	}
 
@@ -2454,7 +2454,7 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (entry->size < 50)
+		if (!entry->type_valid || entry->size < 50)
 			continue;
 
 		if (entry->no_try_delta)
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 11/15] pack-objects: clarify the use of object_entry::size
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (9 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 10/15] pack-objects: don't check size when the object is bad Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 12/15] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
                                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

While this field most of the time contains the canonical object size,
there is one case it does not: when we have found that the base object
of the delta in question is also to be packed, we will very happily
reuse the delta by copying it over instead of regenerating the new
delta.

"size" in this case will record the delta size, not canonical object
size. Later on in write_reuse_object(), we reconstruct the delta
header and "size" is used for this purpose. When this happens, the
"type" field contains a delta type instead of a canonical type.
Highlight this in the code since it could be tricky to see.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 11 ++++++++---
 pack-objects.h         |  4 +++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 481b55c746..7a84c3f59a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1417,6 +1417,7 @@ static void check_object(struct object_entry *entry)
 		off_t ofs;
 		unsigned char *buf, c;
 		enum object_type type;
+		unsigned long in_pack_size;
 
 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -1426,7 +1427,7 @@ static void check_object(struct object_entry *entry)
 		 */
 		used = unpack_object_header_buffer(buf, avail,
 						   &type,
-						   &entry->size);
+						   &in_pack_size);
 		if (used == 0)
 			goto give_up;
 
@@ -1443,6 +1444,7 @@ static void check_object(struct object_entry *entry)
 		default:
 			/* Not a delta hence we've already got all we need. */
 			oe_set_type(entry, entry->in_pack_type);
+			entry->size = in_pack_size;
 			entry->in_pack_header_size = used;
 			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
@@ -1499,6 +1501,7 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
+			entry->size = in_pack_size; /* delta size */
 			SET_DELTA(entry, base_entry);
 			entry->delta_size = entry->size;
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
@@ -1508,13 +1511,15 @@ static void check_object(struct object_entry *entry)
 		}
 
 		if (oe_type(entry)) {
+			off_t delta_pos;
+
 			/*
 			 * This must be a delta and we already know what the
 			 * final object type is.  Let's extract the actual
 			 * object size from the delta header.
 			 */
-			entry->size = get_size_from_delta(p, &w_curs,
-					entry->in_pack_offset + entry->in_pack_header_size);
+			delta_pos = entry->in_pack_offset + entry->in_pack_header_size;
+			entry->size = get_size_from_delta(p, &w_curs, delta_pos);
 			if (entry->size == 0)
 				goto give_up;
 			unuse_pack(&w_curs);
diff --git a/pack-objects.h b/pack-objects.h
index 5613a3d040..534f5a5e4d 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -30,7 +30,9 @@ enum dfs_state {
  *
  * "size" is the uncompressed object size. Compressed size of the raw
  * data for an object in a pack is not stored anywhere but is computed
- * and made available when reverse .idx is made.
+ * and made available when reverse .idx is made. Note that when a
+ * delta is reused, "size" is the uncompressed _delta_ size, not the
+ * canonical one after the delta has been applied.
  *
  * "hash" contains a path name hash which is used for sorting the
  * delta list and also during delta searching. Once prepare_pack()
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 12/15] pack-objects: shrink size field in struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (10 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 11/15] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 13/15] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
                                   ` (4 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

It's very very rare that an uncompressed object is larger than 4GB
(partly because Git does not handle those large files very well to
begin with). Let's optimize it for the common case where object size
is smaller than this limit.

Shrink size field down to 31 bits and one overflow bit. If the size is
too large, we read it back from disk. As noted in the previous patch,
we need to return the delta size instead of canonical size when the
to-be-reused object entry type is a delta instead of a canonical one.

Add two compare helpers that can take advantage of the overflow
bit (e.g. if the file is 4GB+, chances are it's already larger than
core.bigFileThreshold and there's no point in comparing the actual
value).

Another note about oe_get_size_slow(). This function MUST be thread
safe because SIZE() macro is used inside try_delta() which may run in
parallel. Outside parallel code, no-contention locking should be dirt
cheap (or insignificant compared to i/o access anyway). To exercise
this code, it's best to run the test suite with something like

    make test GIT_TEST_OE_SIZE=4

which forces this code on all objects larger than 3 bytes.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 105 +++++++++++++++++++++++++++++++----------
 pack-objects.c         |   3 ++
 pack-objects.h         |  57 +++++++++++++++++++++-
 t/README               |   6 +++
 4 files changed, 146 insertions(+), 25 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7a84c3f59a..b4ea6290f9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -30,6 +30,8 @@
 #include "packfile.h"
 
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
+#define SIZE(obj) oe_size(&to_pack, obj)
+#define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
@@ -274,7 +276,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 
 	if (!usable_delta) {
 		if (oe_type(entry) == OBJ_BLOB &&
-		    entry->size > big_file_threshold &&
+		    oe_size_greater_than(&to_pack, entry, big_file_threshold) &&
 		    (st = open_istream(entry->idx.oid.hash, &type, &size, NULL)) != NULL)
 			buf = NULL;
 		else {
@@ -384,12 +386,13 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
+	unsigned long entry_size = SIZE(entry);
 
 	if (DELTA(entry))
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
-					      type, entry->size);
+					      type, entry_size);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -406,7 +409,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	datalen -= entry->in_pack_header_size;
 
 	if (!pack_to_stdout && p->index_version == 1 &&
-	    check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) {
+	    check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
 		error("corrupt packed object for %s",
 		      oid_to_hex(&entry->idx.oid));
 		unuse_pack(&w_curs);
@@ -1407,6 +1410,8 @@ static void cleanup_preferred_base(void)
 
 static void check_object(struct object_entry *entry)
 {
+	unsigned long canonical_size;
+
 	if (IN_PACK(entry)) {
 		struct packed_git *p = IN_PACK(entry);
 		struct pack_window *w_curs = NULL;
@@ -1444,7 +1449,7 @@ static void check_object(struct object_entry *entry)
 		default:
 			/* Not a delta hence we've already got all we need. */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->size = in_pack_size;
+			SET_SIZE(entry, in_pack_size);
 			entry->in_pack_header_size = used;
 			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
@@ -1501,9 +1506,9 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			entry->size = in_pack_size; /* delta size */
+			SET_SIZE(entry, in_pack_size); /* delta size */
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = entry->size;
+			entry->delta_size = in_pack_size;
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1519,9 +1524,10 @@ static void check_object(struct object_entry *entry)
 			 * object size from the delta header.
 			 */
 			delta_pos = entry->in_pack_offset + entry->in_pack_header_size;
-			entry->size = get_size_from_delta(p, &w_curs, delta_pos);
-			if (entry->size == 0)
+			canonical_size = get_size_from_delta(p, &w_curs, delta_pos);
+			if (canonical_size == 0)
 				goto give_up;
+			SET_SIZE(entry, canonical_size);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1535,13 +1541,18 @@ static void check_object(struct object_entry *entry)
 		unuse_pack(&w_curs);
 	}
 
-	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash, &entry->size));
-	/*
-	 * The error condition is checked in prepare_pack().  This is
-	 * to permit a missing preferred base object to be ignored
-	 * as a preferred base.  Doing so can result in a larger
-	 * pack file, but the transfer will still take place.
-	 */
+	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
+					    &canonical_size));
+	if (entry->type_valid) {
+		SET_SIZE(entry, canonical_size);
+	} else {
+		/*
+		 * Bad object type is checked in prepare_pack().  This is
+		 * to permit a missing preferred base object to be ignored
+		 * as a preferred base.  Doing so can result in a larger
+		 * pack file, but the transfer will still take place.
+		 */
+	}
 }
 
 static int pack_offset_sort(const void *_a, const void *_b)
@@ -1581,6 +1592,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	unsigned *idx = &to_pack.objects[entry->delta_idx - 1].delta_child_idx;
 	struct object_info oi = OBJECT_INFO_INIT;
 	enum object_type type;
+	unsigned long size;
 
 	while (*idx) {
 		struct object_entry *oe = &to_pack.objects[*idx - 1];
@@ -1593,7 +1605,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	SET_DELTA(entry, NULL);
 	entry->depth = 0;
 
-	oi.sizep = &entry->size;
+	oi.sizep = &size;
 	oi.typep = &type;
 	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
 		/*
@@ -1603,10 +1615,11 @@ static void drop_reused_delta(struct object_entry *entry)
 		 * and dealt with in prepare_pack().
 		 */
 		oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
-						    &entry->size));
+						    &size));
 	} else {
 		oe_set_type(entry, type);
 	}
+	SET_SIZE(entry, size);
 }
 
 /*
@@ -1746,7 +1759,8 @@ static void get_object_details(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = sorted_by_offset[i];
 		check_object(entry);
-		if (entry->type_valid && big_file_threshold < entry->size)
+		if (entry->type_valid &&
+		    oe_size_greater_than(&to_pack, entry, big_file_threshold))
 			entry->no_try_delta = 1;
 	}
 
@@ -1775,6 +1789,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
+	unsigned long a_size = SIZE(a);
+	unsigned long b_size = SIZE(b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1788,9 +1804,9 @@ static int type_size_sort(const void *_a, const void *_b)
 		return -1;
 	if (a->preferred_base < b->preferred_base)
 		return 1;
-	if (a->size > b->size)
+	if (a_size > b_size)
 		return -1;
-	if (a->size < b->size)
+	if (a_size < b_size)
 		return 1;
 	return a < b ? -1 : (a > b);  /* newest first */
 }
@@ -1843,6 +1859,46 @@ static pthread_mutex_t progress_mutex;
 
 #endif
 
+/*
+ * Return the size of the object without doing any delta
+ * reconstruction (so non-deltas are true object sizes, but deltas
+ * return the size of the delta data).
+ */
+unsigned long oe_get_size_slow(struct packing_data *pack,
+			       const struct object_entry *e)
+{
+	struct packed_git *p;
+	struct pack_window *w_curs;
+	unsigned char *buf;
+	enum object_type type;
+	unsigned long used, avail, size;
+
+	if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) {
+		read_lock();
+		if (sha1_object_info(e->idx.oid.hash, &size) < 0)
+			die(_("unable to get size of %s"),
+			    oid_to_hex(&e->idx.oid));
+		read_unlock();
+		return size;
+	}
+
+	p = oe_in_pack(pack, e);
+	if (!p)
+		BUG("when e->type is a delta, it must belong to a pack");
+
+	read_lock();
+	w_curs = NULL;
+	buf = use_pack(p, &w_curs, e->in_pack_offset, &avail);
+	used = unpack_object_header_buffer(buf, avail, &type, &size);
+	if (used == 0)
+		die(_("unable to parse object header of %s"),
+		    oid_to_hex(&e->idx.oid));
+
+	unuse_pack(&w_curs);
+	read_unlock();
+	return size;
+}
+
 static int try_delta(struct unpacked *trg, struct unpacked *src,
 		     unsigned max_depth, unsigned long *mem_usage)
 {
@@ -1877,7 +1933,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = trg_entry->size;
+	trg_size = SIZE(trg_entry);
 	if (!DELTA(trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
@@ -1889,7 +1945,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = src_entry->size;
+	src_size = SIZE(src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2009,7 +2065,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += n->entry->size;
+		freed_mem += SIZE(n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2459,7 +2515,8 @@ static void prepare_pack(int window, int depth)
 			 */
 			continue;
 
-		if (!entry->type_valid || entry->size < 50)
+		if (!entry->type_valid ||
+		    oe_size_less_than(&to_pack, entry, 50))
 			continue;
 
 		if (entry->no_try_delta)
diff --git a/pack-objects.c b/pack-objects.c
index 09f0a88865..bf2e0a808d 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -144,6 +144,9 @@ void prepare_packing_data(struct packing_data *pdata)
 	} else {
 		prepare_in_pack_by_idx(pdata);
 	}
+
+	pdata->oe_size_limit = git_env_ulong("GIT_TEST_OE_SIZE",
+					     1U << OE_SIZE_BITS);
 }
 
 struct object_entry *packlist_alloc(struct packing_data *pdata,
diff --git a/pack-objects.h b/pack-objects.h
index 534f5a5e4d..27697be5c9 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -5,6 +5,11 @@
 #define OE_DEPTH_BITS		12
 #define OE_IN_PACK_BITS		10
 #define OE_Z_DELTA_BITS		20
+/*
+ * Note that oe_set_size() becomes expensive when the given size is
+ * above this limit. Don't lower it too much.
+ */
+#define OE_SIZE_BITS		31
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -68,7 +73,8 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
-	unsigned long size;	/* uncompressed size */
+	unsigned size_:OE_SIZE_BITS;
+	unsigned size_valid:1;
 	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
@@ -113,6 +119,8 @@ struct packing_data {
 	 */
 	struct packed_git **in_pack_by_idx;
 	struct packed_git **in_pack;
+
+	uintmax_t oe_size_limit;
 };
 
 void prepare_packing_data(struct packing_data *pdata);
@@ -252,4 +260,51 @@ static inline void oe_set_delta_sibling(struct packing_data *pack,
 		e->delta_sibling_idx = 0;
 }
 
+unsigned long oe_get_size_slow(struct packing_data *pack,
+			       const struct object_entry *e);
+static inline unsigned long oe_size(struct packing_data *pack,
+				    const struct object_entry *e)
+{
+	if (e->size_valid)
+		return e->size_;
+
+	return oe_get_size_slow(pack, e);
+}
+
+static inline int oe_size_less_than(struct packing_data *pack,
+				    const struct object_entry *lhs,
+				    unsigned long rhs)
+{
+	if (lhs->size_valid)
+		return lhs->size_ < rhs;
+	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
+		return 0;
+	return oe_get_size_slow(pack, lhs) < rhs;
+}
+
+static inline int oe_size_greater_than(struct packing_data *pack,
+				       const struct object_entry *lhs,
+				       unsigned long rhs)
+{
+	if (lhs->size_valid)
+		return lhs->size_ > rhs;
+	if (rhs < pack->oe_size_limit) /* rhs < 2^x <= lhs ? */
+		return 1;
+	return oe_get_size_slow(pack, lhs) > rhs;
+}
+
+static inline void oe_set_size(struct packing_data *pack,
+			       struct object_entry *e,
+			       unsigned long size)
+{
+	if (size < pack->oe_size_limit) {
+		e->size_ = size;
+		e->size_valid = 1;
+	} else {
+		e->size_valid = 0;
+		if (oe_get_size_slow(pack, e) != size)
+			BUG("'size' is supposed to be the object size!");
+	}
+}
+
 #endif
diff --git a/t/README b/t/README
index c6130ff16d..c01d210c15 100644
--- a/t/README
+++ b/t/README
@@ -306,6 +306,12 @@ GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
 path where there are more than 1024 packs even if the actual number of
 packs in repository is below this limit.
 
+GIT_TEST_OE_SIZE=<n> exercises the uncommon pack-objects code path
+where we do not cache object size in memory and read it from existing
+packs on demand. This normally only happens when the object size is
+over 2GB. This variable forces the code path on any object larger than
+<n> bytes.
+
 Naming Tests
 ------------
 
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 13/15] pack-objects: shrink delta_size field in struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (11 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 12/15] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 14/15] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
                                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Allowing a delta size of 64 bits is crazy. Shrink this field down to
20 bits with one overflow bit.

If we find an existing delta larger than 1MB, we do not cache
delta_size at all and will get the value from oe_size(), potentially
from disk if it's larger than 2GB.

Note, since DELTA_SIZE() is used in try_delta() code, it must be
thread-safe. Luckily oe_size() does guarantee this, so it is
thread-safe.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 26 ++++++++++++++++----------
 pack-objects.h         | 23 ++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b4ea6290f9..b5bba2c228 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -32,10 +32,12 @@
 #define IN_PACK(obj) oe_in_pack(&to_pack, obj)
 #define SIZE(obj) oe_size(&to_pack, obj)
 #define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
+#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
 #define DELTA(obj) oe_delta(&to_pack, obj)
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
 
@@ -142,7 +144,7 @@ static void *get_delta(struct object_entry *entry)
 		    oid_to_hex(&DELTA(entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != entry->delta_size)
+	if (!delta_buf || delta_size != DELTA_SIZE(entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -293,14 +295,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = entry->delta_size;
+		size = DELTA_SIZE(entry);
 		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
@@ -1508,7 +1510,7 @@ static void check_object(struct object_entry *entry)
 			oe_set_type(entry, entry->in_pack_type);
 			SET_SIZE(entry, in_pack_size); /* delta size */
 			SET_DELTA(entry, base_entry);
-			entry->delta_size = in_pack_size;
+			SET_DELTA_SIZE(entry, in_pack_size);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
 			SET_DELTA_CHILD(base_entry, entry);
 			unuse_pack(&w_curs);
@@ -1938,7 +1940,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = trg_entry->delta_size;
+		max_size = DELTA_SIZE(trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
@@ -2009,10 +2011,14 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	delta_buf = create_delta(src->index, trg->data, trg_size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
+	if (delta_size >= (1U << OE_DELTA_SIZE_BITS)) {
+		free(delta_buf);
+		return 0;
+	}
 
 	if (DELTA(trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == trg_entry->delta_size &&
+		if (delta_size == DELTA_SIZE(trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -2027,7 +2033,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= trg_entry->delta_size;
+		delta_cache_size -= DELTA_SIZE(trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -2040,7 +2046,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	}
 
 	SET_DELTA(trg_entry, src_entry);
-	trg_entry->delta_size = delta_size;
+	SET_DELTA_SIZE(trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2163,11 +2169,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, entry->delta_size);
+			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
 			if (size < (1U << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
-				delta_cache_size -= entry->delta_size;
+				delta_cache_size -= DELTA_SIZE(entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
diff --git a/pack-objects.h b/pack-objects.h
index 27697be5c9..b5114a70a7 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -10,6 +10,7 @@
  * above this limit. Don't lower it too much.
  */
 #define OE_SIZE_BITS		31
+#define OE_DELTA_SIZE_BITS	20
 
 /*
  * State flags for depth-first search used for analyzing delta cycles.
@@ -83,7 +84,8 @@ struct object_entry {
 				     * uses the same base as me
 				     */
 	void *delta_data;	/* cached delta (uncompressed) */
-	unsigned long delta_size;	/* delta data size (uncompressed) */
+	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
+	unsigned delta_size_valid:1;
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
 	unsigned type_:TYPE_BITS;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
@@ -307,4 +309,23 @@ static inline void oe_set_size(struct packing_data *pack,
 	}
 }
 
+static inline unsigned long oe_delta_size(struct packing_data *pack,
+					  const struct object_entry *e)
+{
+	if (e->delta_size_valid)
+		return e->delta_size_;
+	return oe_size(pack, e);
+}
+
+static inline void oe_set_delta_size(struct packing_data *pack,
+				     struct object_entry *e,
+				     unsigned long size)
+{
+	e->delta_size_ = size;
+	e->delta_size_valid = e->delta_size_ == size;
+	if (!e->delta_size_valid && size != oe_size(pack, e))
+		BUG("this can only happen in check_object() "
+		    "where delta size is the same as entry size");
+}
+
 #endif
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 14/15] pack-objects: reorder members to shrink struct object_entry
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (12 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 13/15] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 10:03                 ` [PATCH v8 15/15] ci: exercise the whole test suite with uncommon code in pack-objects Nguyễn Thái Ngọc Duy
                                   ` (2 subsequent siblings)
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Previous patches leave lots of holes and padding in this struct. This
patch reorders the members and shrinks the struct down to 80 bytes
(from 136 bytes on 64-bit systems, before any field shrinking is done)
with 16 bits to spare (and a couple more in in_pack_header_size when
we really run out of bits).

This is the last in a series of memory reduction patches (see
"pack-objects: a bit of document about struct object_entry" for the
first one).

Overall they've reduced repack memory size on linux-2.6.git from
3.747G to 3.424G, or by around 320M, a decrease of 8.5%. The runtime
of repack has stayed the same throughout this series. Ævar's testing
on a big monorepo he has access to (bigger than linux-2.6.git) has
shown a 7.9% reduction, so the overall expected improvement should be
somewhere around 8%.

See 87po42cwql.fsf@evledraar.gmail.com on-list
(https://public-inbox.org/git/87po42cwql.fsf@evledraar.gmail.com/) for
more detailed numbers and a test script used to produce the numbers
cited above.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-objects.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/pack-objects.h b/pack-objects.h
index b5114a70a7..60192cce1f 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -26,6 +26,10 @@ enum dfs_state {
 };
 
 /*
+ * The size of struct nearly determines pack-objects's memory
+ * consumption. This struct is packed tight for that reason. When you
+ * add or reorder something in this struct, think a bit about this.
+ *
  * basic object info
  * -----------------
  * idx.oid is filled up before delta searching starts. idx.crc32 is
@@ -74,34 +78,44 @@ enum dfs_state {
  */
 struct object_entry {
 	struct pack_idx_entry idx;
+	void *delta_data;	/* cached delta (uncompressed) */
+	off_t in_pack_offset;
+	uint32_t hash;			/* name hint hash */
 	unsigned size_:OE_SIZE_BITS;
 	unsigned size_valid:1;
-	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
-	off_t in_pack_offset;
 	uint32_t delta_idx;	/* delta base object */
 	uint32_t delta_child_idx; /* deltified objects who bases me */
 	uint32_t delta_sibling_idx; /* other deltified objects who
 				     * uses the same base as me
 				     */
-	void *delta_data;	/* cached delta (uncompressed) */
 	unsigned delta_size_:OE_DELTA_SIZE_BITS; /* delta data size (uncompressed) */
 	unsigned delta_size_valid:1;
+	unsigned in_pack_idx:OE_IN_PACK_BITS;	/* already in pack */
 	unsigned z_delta_size:OE_Z_DELTA_BITS;
+	unsigned type_valid:1;
 	unsigned type_:TYPE_BITS;
+	unsigned no_try_delta:1;
 	unsigned in_pack_type:TYPE_BITS; /* could be delta */
-	unsigned type_valid:1;
-	uint32_t hash;			/* name hint hash */
-	unsigned char in_pack_header_size;
 	unsigned preferred_base:1; /*
 				    * we do not pack this, but is available
 				    * to be used as the base object to delta
 				    * objects against.
 				    */
-	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
 	unsigned dfs_state:OE_DFS_STATE_BITS;
+	unsigned char in_pack_header_size;
 	unsigned depth:OE_DEPTH_BITS;
+
+	/*
+	 * pahole results on 64-bit linux (gcc and clang)
+	 *
+	 *   size: 80, bit_padding: 20 bits, holes: 8 bits
+	 *
+	 * and on 32-bit (gcc)
+	 *
+	 *   size: 76, bit_padding: 20 bits, holes: 8 bits
+	 */
 };
 
 struct packing_data {
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* [PATCH v8 15/15] ci: exercise the whole test suite with uncommon code in pack-objects
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (13 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 14/15] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
@ 2018-03-31 10:03                 ` Nguyễn Thái Ngọc Duy
  2018-03-31 11:36                 ` [PATCH v8 00/15] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
  2018-04-06 21:47                 ` Jeff King
  16 siblings, 0 replies; 273+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2018-03-31 10:03 UTC (permalink / raw)
  To: pclouds; +Cc: avarab, e, git, gitster, peff

Some recent optimizations have been added to pack-objects to reduce
memory usage and some code paths are split into two: one for common
use cases and one for rare ones. Make sure the rare cases are tested
with Travis since it requires manual test configuration that is
unlikely to be done by developers.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 ci/run-tests.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/run-tests.sh b/ci/run-tests.sh
index 73e273fac7..857d144ee8 100755
--- a/ci/run-tests.sh
+++ b/ci/run-tests.sh
@@ -10,7 +10,10 @@ ln -s "$cache_dir/.prove" t/.prove
 make --quiet test
 if test "$jobname" = "linux-gcc"
 then
-	GIT_TEST_SPLIT_INDEX=YesPlease make --quiet test
+	export GIT_TEST_SPLIT_INDEX=YesPlease
+	export GIT_TEST_FULL_IN_PACK_ARRAY=true
+	export GIT_TEST_OE_SIZE=10
+	make --quiet test
 fi
 
 check_unignored_build_artifacts
-- 
2.17.0.rc2.515.g4feb9b7923


^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size
  2018-03-31  4:35                   ` Duy Nguyen
@ 2018-03-31 10:13                     ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-31 10:13 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Sat, Mar 31, 2018 at 06:35:40AM +0200, Duy Nguyen wrote:

> On Fri, Mar 30, 2018 at 11:04 PM, Jeff King <peff@peff.net> wrote:
> > The subject says "clarify" so I was a little surprised to see code
> > changes. It looks like we're just avoiding reassigning on top of the
> > value repeatedly, which is part of that clarification. It looks like a
> > noop to me.
> 
> Oh well... I was counting on the new name (in_pack_size, which follows
> in_pack_type naming convention) to emphasize it (and the new "delta
> size" comment to point out where in_pack_size contains a delta size).

Just to be clear, my final "it looks like a noop" means "good, it looks
like it is a pure cosmetic change and no change to the behavior."

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry
  2018-03-31  4:40                   ` Duy Nguyen
@ 2018-03-31 10:17                     ` Jeff King
  0 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-03-31 10:17 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Sat, Mar 31, 2018 at 06:40:23AM +0200, Duy Nguyen wrote:

> > Unlike the depth, I don't think there's any _inherent_ reason you
> > couldn't throw, say, 1MB deltas into the cache (if you sized it large
> > enough). But I doubt such deltas are really all that common. Here are
> > the top 10 in linux.git:
> >
> >   $ git cat-file --batch-all-objects --batch-check='%(deltabase) %(objectsize:disk)' |
> >     grep -v ^00000 | sort -k 2nr | head
> >   a02b6794337286bc12c907c33d5d75537c240bd0 769103
> >   b28d4b64c05da02c5e8c684dcb9422876225ebdc 327116
> >   1e98ce86ed19aff9ba721d13a749ff08088c9922 325257
> >   a02b6794337286bc12c907c33d5d75537c240bd0 240647
> >   c550d99286c01867dfb26e432417f3106acf8611 177896
> >   5977795854f852c2b95dd023fd03cace023ee41c 119737
> >   4ccf9681c45d01d17376f7e0d266532a4460f5f8 112671
> >   b39fb6821faa9e7bc36de738152a2817b4bf3654 112657
> >   2645d6239b74bebd661436762e819b831095b084 103980
> >   b8ce7fe5d8def58dc63b7ae099eff7bd07e4e845 101014
> >
> > It's possible some weird workload would want to tweak this. Say you were
> > storing a ton of delta-capable files that were big and always differed
> > by a megabyte. And it was somehow really important to you to tradeoff
> > memory for CPU during the write phase of a pack.
> 
> We're not short on spare bits so I will try to raise this limit to 1MB
> (not because you mentioned 1MB, but because the largest size in your
> output is close to 1MB).

I doubt it matters much. Unless somebody has been tweaking the config
themselves, this has been limited to 1000 for everybody running
linux.git and nobody has ever noticed.

So I think it would only be an issue if:

  1. you had an oddball repo with gigantic deltas

AND

  2. you for some reason really cared about caching the deltas between
     phases

AND

  3. you had done enough homework to even figure out that this knob
     existed

I was thinking that you might care about (2) for serving fetches of your
oddball repository. But really, if you care about minimizing work, you
want to be reusing on-disk deltas anyway, which would skip this cache
entirely. So any work we do to reproduce the delta would probably be
dwarfed by the finding of this giant delta in the first place.

So raise the limit if you want, but I'd be surprised if anybody was even
doing (3) in the first place.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-31  4:51                   ` Duy Nguyen
@ 2018-03-31 10:20                     ` Jeff King
  2018-03-31 10:45                       ` Duy Nguyen
  0 siblings, 1 reply; 273+ messages in thread
From: Jeff King @ 2018-03-31 10:20 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Sat, Mar 31, 2018 at 06:51:10AM +0200, Duy Nguyen wrote:

> >> +#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
> >
> > How come this one gets a macro, but the earlier conversions don't?
> >
> > I guess the problem is that oe_in_pack() is defined in the generic
> > pack-objects.h, but &to_pack is only in builtin/pack-objects.c?
> >
> > I wonder if it would be that bad to just say oe_in_pack(&to_pack, obj)
> > everywhere. It's longer, but it makes the code slightly less magical to
> > read.
> 
> Longer was exactly why I added these macros (with the hope that the
> macro upper case names already ring a "it's magical" bell). Should I
> drop all these macros? Some code becomes a lot more verbose though.

I'm on the fence. I agree that the macro screams "magical". I just
sometimes see a macro and think something really weird and
unfunction-like is going on. But really we're just replacing a default
parameter.

So I dunno. If you get rid of the macros and I look at it, I give even
odds that I'll say "yech, put them back!". :)

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v7 06/13] pack-objects: move in_pack out of struct object_entry
  2018-03-31 10:20                     ` Jeff King
@ 2018-03-31 10:45                       ` Duy Nguyen
  0 siblings, 0 replies; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31 10:45 UTC (permalink / raw)
  To: Jeff King
  Cc: Ævar Arnfjörð Bjarmason, Eric Wong,
	Git Mailing List, Junio C Hamano

On Sat, Mar 31, 2018 at 06:20:07AM -0400, Jeff King wrote:
> On Sat, Mar 31, 2018 at 06:51:10AM +0200, Duy Nguyen wrote:
> 
> > >> +#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
> > >
> > > How come this one gets a macro, but the earlier conversions don't?
> > >
> > > I guess the problem is that oe_in_pack() is defined in the generic
> > > pack-objects.h, but &to_pack is only in builtin/pack-objects.c?
> > >
> > > I wonder if it would be that bad to just say oe_in_pack(&to_pack, obj)
> > > everywhere. It's longer, but it makes the code slightly less magical to
> > > read.
> > 
> > Longer was exactly why I added these macros (with the hope that the
> > macro upper case names already ring a "it's magical" bell). Should I
> > drop all these macros? Some code becomes a lot more verbose though.
> 
> I'm on the fence. I agree that the macro screams "magical". I just
> sometimes see a macro and think something really weird and
> unfunction-like is going on. But really we're just replacing a default
> parameter.
> 
> So I dunno. If you get rid of the macros and I look at it, I give even
> odds that I'll say "yech, put them back!". :)

It would look like this (on top of v8). I think the "&to_pack" part is
most distracting when it's used as part of an expression (or a
function argument). I probably went overboard with SET_ macros though.

-- 8< --
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b5bba2c228..dec849b755 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -29,18 +29,6 @@
 #include "list.h"
 #include "packfile.h"
 
-#define IN_PACK(obj) oe_in_pack(&to_pack, obj)
-#define SIZE(obj) oe_size(&to_pack, obj)
-#define SET_SIZE(obj,size) oe_set_size(&to_pack, obj, size)
-#define DELTA_SIZE(obj) oe_delta_size(&to_pack, obj)
-#define DELTA(obj) oe_delta(&to_pack, obj)
-#define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
-#define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
-#define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
-#define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
-#define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
-#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
-
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
 	N_("git pack-objects [<options>...] <base-name> [< <ref-list> | < <object-list>]"),
@@ -137,14 +125,14 @@ static void *get_delta(struct object_entry *entry)
 	buf = read_sha1_file(entry->idx.oid.hash, &type, &size);
 	if (!buf)
 		die("unable to read %s", oid_to_hex(&entry->idx.oid));
-	base_buf = read_sha1_file(DELTA(entry)->idx.oid.hash, &type,
+	base_buf = read_sha1_file(oe_delta(&to_pack, entry)->idx.oid.hash, &type,
 				  &base_size);
 	if (!base_buf)
 		die("unable to read %s",
-		    oid_to_hex(&DELTA(entry)->idx.oid));
+		    oid_to_hex(&oe_delta(&to_pack, entry)->idx.oid));
 	delta_buf = diff_delta(base_buf, base_size,
 			       buf, size, &delta_size, 0);
-	if (!delta_buf || delta_size != DELTA_SIZE(entry))
+	if (!delta_buf || delta_size != oe_delta_size(&to_pack, entry))
 		die("delta size changed");
 	free(buf);
 	free(base_buf);
@@ -295,15 +283,15 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		FREE_AND_NULL(entry->delta_data);
 		entry->z_delta_size = 0;
 	} else if (entry->delta_data) {
-		size = DELTA_SIZE(entry);
+		size = oe_delta_size(&to_pack, entry);
 		buf = entry->delta_data;
 		entry->delta_data = NULL;
-		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
+		type = (allow_ofs_delta && oe_delta(&to_pack, entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	} else {
 		buf = get_delta(entry);
-		size = DELTA_SIZE(entry);
-		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
+		size = oe_delta_size(&to_pack, entry);
+		type = (allow_ofs_delta && oe_delta(&to_pack, entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
@@ -327,7 +315,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		 * encoding of the relative offset for the delta
 		 * base from this object's position in the pack.
 		 */
-		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
+		off_t ofs = entry->idx.offset - oe_delta(&to_pack, entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -353,7 +341,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
+		hashwrite(f, oe_delta(&to_pack, entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
@@ -379,7 +367,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 				unsigned long limit, int usable_delta)
 {
-	struct packed_git *p = IN_PACK(entry);
+	struct packed_git *p = oe_in_pack(&to_pack, entry);
 	struct pack_window *w_curs = NULL;
 	struct revindex_entry *revidx;
 	off_t offset;
@@ -388,10 +376,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	unsigned char header[MAX_PACK_OBJECT_HEADER],
 		      dheader[MAX_PACK_OBJECT_HEADER];
 	unsigned hdrlen;
-	unsigned long entry_size = SIZE(entry);
+	unsigned long entry_size = oe_size(&to_pack, entry);
 
-	if (DELTA(entry))
-		type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
+	if (oe_delta(&to_pack, entry))
+		type = (allow_ofs_delta && oe_delta(&to_pack, entry)->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, entry_size);
@@ -419,7 +407,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	}
 
 	if (type == OBJ_OFS_DELTA) {
-		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
+		off_t ofs = entry->idx.offset - oe_delta(&to_pack, entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
 		dheader[pos] = ofs & 127;
 		while (ofs >>= 7)
@@ -438,7 +426,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 			return 0;
 		}
 		hashwrite(f, header, hdrlen);
-		hashwrite(f, DELTA(entry)->idx.oid.hash, 20);
+		hashwrite(f, oe_delta(&to_pack, entry)->idx.oid.hash, 20);
 		hdrlen += 20;
 		reused_delta++;
 	} else {
@@ -478,20 +466,20 @@ static off_t write_object(struct hashfile *f,
 	else
 		limit = pack_size_limit - write_offset;
 
-	if (!DELTA(entry))
+	if (!oe_delta(&to_pack, entry))
 		usable_delta = 0;	/* no delta */
 	else if (!pack_size_limit)
 	       usable_delta = 1;	/* unlimited packfile */
-	else if (DELTA(entry)->idx.offset == (off_t)-1)
+	else if (oe_delta(&to_pack, entry)->idx.offset == (off_t)-1)
 		usable_delta = 0;	/* base was written to another pack */
-	else if (DELTA(entry)->idx.offset)
+	else if (oe_delta(&to_pack, entry)->idx.offset)
 		usable_delta = 1;	/* base already exists in this pack */
 	else
 		usable_delta = 0;	/* base could end up in another pack */
 
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
-	else if (!IN_PACK(entry))
+	else if (!oe_in_pack(&to_pack, entry))
 		to_reuse = 0;	/* can't reuse what we don't have */
 	else if (oe_type(entry) == OBJ_REF_DELTA ||
 		 oe_type(entry) == OBJ_OFS_DELTA)
@@ -500,7 +488,7 @@ static off_t write_object(struct hashfile *f,
 				/* ... but pack split may override that */
 	else if (oe_type(entry) != entry->in_pack_type)
 		to_reuse = 0;	/* pack has delta which is unusable */
-	else if (DELTA(entry))
+	else if (oe_delta(&to_pack, entry))
 		to_reuse = 0;	/* we want to pack afresh */
 	else
 		to_reuse = 1;	/* we have it in-pack undeltified,
@@ -552,12 +540,12 @@ static enum write_one_status write_one(struct hashfile *f,
 	}
 
 	/* if we are deltified, write out base object first. */
-	if (DELTA(e)) {
+	if (oe_delta(&to_pack, e)) {
 		e->idx.offset = 1; /* now recurse */
-		switch (write_one(f, DELTA(e), offset)) {
+		switch (write_one(f, oe_delta(&to_pack, e), offset)) {
 		case WRITE_ONE_RECURSIVE:
 			/* we cannot depend on this one */
-			SET_DELTA(e, NULL);
+			oe_set_delta(&to_pack, e, NULL);
 			break;
 		default:
 			break;
@@ -619,34 +607,34 @@ static void add_descendants_to_write_order(struct object_entry **wo,
 			/* add this node... */
 			add_to_write_order(wo, endp, e);
 			/* all its siblings... */
-			for (s = DELTA_SIBLING(e); s; s = DELTA_SIBLING(s)) {
+			for (s = oe_delta_sibling(&to_pack, e); s; s = oe_delta_sibling(&to_pack, s)) {
 				add_to_write_order(wo, endp, s);
 			}
 		}
 		/* drop down a level to add left subtree nodes if possible */
-		if (DELTA_CHILD(e)) {
+		if (oe_delta_child(&to_pack, e)) {
 			add_to_order = 1;
-			e = DELTA_CHILD(e);
+			e = oe_delta_child(&to_pack, e);
 		} else {
 			add_to_order = 0;
 			/* our sibling might have some children, it is next */
-			if (DELTA_SIBLING(e)) {
-				e = DELTA_SIBLING(e);
+			if (oe_delta_sibling(&to_pack, e)) {
+				e = oe_delta_sibling(&to_pack, e);
 				continue;
 			}
 			/* go back to our parent node */
-			e = DELTA(e);
-			while (e && !DELTA_SIBLING(e)) {
+			e = oe_delta(&to_pack, e);
+			while (e && !oe_delta_sibling(&to_pack, e)) {
 				/* we're on the right side of a subtree, keep
 				 * going up until we can go right again */
-				e = DELTA(e);
+				e = oe_delta(&to_pack, e);
 			}
 			if (!e) {
 				/* done- we hit our original root node */
 				return;
 			}
 			/* pass it off to sibling at this level */
-			e = DELTA_SIBLING(e);
+			e = oe_delta_sibling(&to_pack, e);
 		}
 	};
 }
@@ -657,7 +645,7 @@ static void add_family_to_write_order(struct object_entry **wo,
 {
 	struct object_entry *root;
 
-	for (root = e; DELTA(root); root = DELTA(root))
+	for (root = e; oe_delta(&to_pack, root); root = oe_delta(&to_pack, root))
 		; /* nothing */
 	add_descendants_to_write_order(wo, endp, root);
 }
@@ -672,8 +660,8 @@ static struct object_entry **compute_write_order(void)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		objects[i].tagged = 0;
 		objects[i].filled = 0;
-		SET_DELTA_CHILD(&objects[i], NULL);
-		SET_DELTA_SIBLING(&objects[i], NULL);
+		oe_set_delta_child(&to_pack, &objects[i], NULL);
+		oe_set_delta_sibling(&to_pack, &objects[i], NULL);
 	}
 
 	/*
@@ -683,11 +671,11 @@ static struct object_entry **compute_write_order(void)
 	 */
 	for (i = to_pack.nr_objects; i > 0;) {
 		struct object_entry *e = &objects[--i];
-		if (!DELTA(e))
+		if (!oe_delta(&to_pack, e))
 			continue;
 		/* Mark me as the first child */
-		e->delta_sibling_idx = DELTA(e)->delta_child_idx;
-		SET_DELTA_CHILD(DELTA(e), e);
+		e->delta_sibling_idx = oe_delta(&to_pack, e)->delta_child_idx;
+		oe_set_delta_child(&to_pack, oe_delta(&to_pack, e), e);
 	}
 
 	/*
@@ -1414,8 +1402,8 @@ static void check_object(struct object_entry *entry)
 {
 	unsigned long canonical_size;
 
-	if (IN_PACK(entry)) {
-		struct packed_git *p = IN_PACK(entry);
+	if (oe_in_pack(&to_pack, entry)) {
+		struct packed_git *p = oe_in_pack(&to_pack, entry);
 		struct pack_window *w_curs = NULL;
 		const unsigned char *base_ref = NULL;
 		struct object_entry *base_entry;
@@ -1451,7 +1439,7 @@ static void check_object(struct object_entry *entry)
 		default:
 			/* Not a delta hence we've already got all we need. */
 			oe_set_type(entry, entry->in_pack_type);
-			SET_SIZE(entry, in_pack_size);
+			oe_set_size(&to_pack, entry, in_pack_size);
 			entry->in_pack_header_size = used;
 			if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
 				goto give_up;
@@ -1508,11 +1496,11 @@ static void check_object(struct object_entry *entry)
 			 * circular deltas.
 			 */
 			oe_set_type(entry, entry->in_pack_type);
-			SET_SIZE(entry, in_pack_size); /* delta size */
-			SET_DELTA(entry, base_entry);
-			SET_DELTA_SIZE(entry, in_pack_size);
+			oe_set_size(&to_pack, entry, in_pack_size); /* delta size */
+			oe_set_delta(&to_pack, entry, base_entry);
+			oe_set_delta_size(&to_pack, entry, in_pack_size);
 			entry->delta_sibling_idx = base_entry->delta_child_idx;
-			SET_DELTA_CHILD(base_entry, entry);
+			oe_set_delta_child(&to_pack, base_entry, entry);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1529,7 +1517,7 @@ static void check_object(struct object_entry *entry)
 			canonical_size = get_size_from_delta(p, &w_curs, delta_pos);
 			if (canonical_size == 0)
 				goto give_up;
-			SET_SIZE(entry, canonical_size);
+			oe_set_size(&to_pack, entry, canonical_size);
 			unuse_pack(&w_curs);
 			return;
 		}
@@ -1546,7 +1534,7 @@ static void check_object(struct object_entry *entry)
 	oe_set_type(entry, sha1_object_info(entry->idx.oid.hash,
 					    &canonical_size));
 	if (entry->type_valid) {
-		SET_SIZE(entry, canonical_size);
+		oe_set_size(&to_pack, entry, canonical_size);
 	} else {
 		/*
 		 * Bad object type is checked in prepare_pack().  This is
@@ -1561,8 +1549,8 @@ static int pack_offset_sort(const void *_a, const void *_b)
 {
 	const struct object_entry *a = *(struct object_entry **)_a;
 	const struct object_entry *b = *(struct object_entry **)_b;
-	const struct packed_git *a_in_pack = IN_PACK(a);
-	const struct packed_git *b_in_pack = IN_PACK(b);
+	const struct packed_git *a_in_pack = oe_in_pack(&to_pack, a);
+	const struct packed_git *b_in_pack = oe_in_pack(&to_pack, b);
 
 	/* avoid filesystem trashing with loose objects */
 	if (!a_in_pack && !b_in_pack)
@@ -1604,12 +1592,12 @@ static void drop_reused_delta(struct object_entry *entry)
 		else
 			idx = &oe->delta_sibling_idx;
 	}
-	SET_DELTA(entry, NULL);
+	oe_set_delta(&to_pack, entry, NULL);
 	entry->depth = 0;
 
 	oi.sizep = &size;
 	oi.typep = &type;
-	if (packed_object_info(IN_PACK(entry), entry->in_pack_offset, &oi) < 0) {
+	if (packed_object_info(oe_in_pack(&to_pack, entry), entry->in_pack_offset, &oi) < 0) {
 		/*
 		 * We failed to get the info from this pack for some reason;
 		 * fall back to sha1_object_info, which may find another copy.
@@ -1621,7 +1609,7 @@ static void drop_reused_delta(struct object_entry *entry)
 	} else {
 		oe_set_type(entry, type);
 	}
-	SET_SIZE(entry, size);
+	oe_set_size(&to_pack, entry, size);
 }
 
 /*
@@ -1645,7 +1633,7 @@ static void break_delta_chains(struct object_entry *entry)
 
 	for (cur = entry, total_depth = 0;
 	     cur;
-	     cur = DELTA(cur), total_depth++) {
+	     cur = oe_delta(&to_pack, cur), total_depth++) {
 		if (cur->dfs_state == DFS_DONE) {
 			/*
 			 * We've already seen this object and know it isn't
@@ -1670,7 +1658,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * it's not a delta, we're done traversing, but we'll mark it
 		 * done to save time on future traversals.
 		 */
-		if (!DELTA(cur)) {
+		if (!oe_delta(&to_pack, cur)) {
 			cur->dfs_state = DFS_DONE;
 			break;
 		}
@@ -1693,7 +1681,7 @@ static void break_delta_chains(struct object_entry *entry)
 		 * We keep all commits in the chain that we examined.
 		 */
 		cur->dfs_state = DFS_ACTIVE;
-		if (DELTA(cur)->dfs_state == DFS_ACTIVE) {
+		if (oe_delta(&to_pack, cur)->dfs_state == DFS_ACTIVE) {
 			drop_reused_delta(cur);
 			cur->dfs_state = DFS_DONE;
 			break;
@@ -1708,7 +1696,7 @@ static void break_delta_chains(struct object_entry *entry)
 	 * an extra "next" pointer to keep going after we reset cur->delta.
 	 */
 	for (cur = entry; cur; cur = next) {
-		next = DELTA(cur);
+		next = oe_delta(&to_pack, cur);
 
 		/*
 		 * We should have a chain of zero or more ACTIVE states down to
@@ -1791,8 +1779,8 @@ static int type_size_sort(const void *_a, const void *_b)
 	const struct object_entry *b = *(struct object_entry **)_b;
 	enum object_type a_type = oe_type(a);
 	enum object_type b_type = oe_type(b);
-	unsigned long a_size = SIZE(a);
-	unsigned long b_size = SIZE(b);
+	unsigned long a_size = oe_size(&to_pack, a);
+	unsigned long b_size = oe_size(&to_pack, b);
 
 	if (a_type > b_type)
 		return -1;
@@ -1923,8 +1911,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	 * it, we will still save the transfer cost, as we already know
 	 * the other side has it and we won't send src_entry at all.
 	 */
-	if (reuse_delta && IN_PACK(trg_entry) &&
-	    IN_PACK(trg_entry) == IN_PACK(src_entry) &&
+	if (reuse_delta && oe_in_pack(&to_pack, trg_entry) &&
+	    oe_in_pack(&to_pack, trg_entry) == oe_in_pack(&to_pack, src_entry) &&
 	    !src_entry->preferred_base &&
 	    trg_entry->in_pack_type != OBJ_REF_DELTA &&
 	    trg_entry->in_pack_type != OBJ_OFS_DELTA)
@@ -1935,19 +1923,19 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 
 	/* Now some size filtering heuristics. */
-	trg_size = SIZE(trg_entry);
-	if (!DELTA(trg_entry)) {
+	trg_size = oe_size(&to_pack, trg_entry);
+	if (!oe_delta(&to_pack, trg_entry)) {
 		max_size = trg_size/2 - 20;
 		ref_depth = 1;
 	} else {
-		max_size = DELTA_SIZE(trg_entry);
+		max_size = oe_delta_size(&to_pack, trg_entry);
 		ref_depth = trg->depth;
 	}
 	max_size = (uint64_t)max_size * (max_depth - src->depth) /
 						(max_depth - ref_depth + 1);
 	if (max_size == 0)
 		return 0;
-	src_size = SIZE(src_entry);
+	src_size = oe_size(&to_pack, src_entry);
 	sizediff = src_size < trg_size ? trg_size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
@@ -2016,9 +2004,9 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		return 0;
 	}
 
-	if (DELTA(trg_entry)) {
+	if (oe_delta(&to_pack, trg_entry)) {
 		/* Prefer only shallower same-sized deltas. */
-		if (delta_size == DELTA_SIZE(trg_entry) &&
+		if (delta_size == oe_delta_size(&to_pack, trg_entry) &&
 		    src->depth + 1 >= trg->depth) {
 			free(delta_buf);
 			return 0;
@@ -2033,7 +2021,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 	free(trg_entry->delta_data);
 	cache_lock();
 	if (trg_entry->delta_data) {
-		delta_cache_size -= DELTA_SIZE(trg_entry);
+		delta_cache_size -= oe_delta_size(&to_pack, trg_entry);
 		trg_entry->delta_data = NULL;
 	}
 	if (delta_cacheable(src_size, trg_size, delta_size)) {
@@ -2045,8 +2033,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 		free(delta_buf);
 	}
 
-	SET_DELTA(trg_entry, src_entry);
-	SET_DELTA_SIZE(trg_entry, delta_size);
+	oe_set_delta(&to_pack, trg_entry, src_entry);
+	oe_set_delta_size(&to_pack, trg_entry, delta_size);
 	trg->depth = src->depth + 1;
 
 	return 1;
@@ -2054,13 +2042,13 @@ static int try_delta(struct unpacked *trg, struct unpacked *src,
 
 static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 {
-	struct object_entry *child = DELTA_CHILD(me);
+	struct object_entry *child = oe_delta_child(&to_pack, me);
 	unsigned int m = n;
 	while (child) {
 		unsigned int c = check_delta_limit(child, n + 1);
 		if (m < c)
 			m = c;
-		child = DELTA_SIBLING(child);
+		child = oe_delta_sibling(&to_pack, child);
 	}
 	return m;
 }
@@ -2071,7 +2059,7 @@ static unsigned long free_unpacked(struct unpacked *n)
 	free_delta_index(n->index);
 	n->index = NULL;
 	if (n->data) {
-		freed_mem += SIZE(n->entry);
+		freed_mem += oe_size(&to_pack, n->entry);
 		FREE_AND_NULL(n->data);
 	}
 	n->entry = NULL;
@@ -2129,7 +2117,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * otherwise they would become too deep.
 		 */
 		max_depth = depth;
-		if (DELTA_CHILD(entry)) {
+		if (oe_delta_child(&to_pack, entry)) {
 			max_depth -= check_delta_limit(entry, 0);
 			if (max_depth <= 0)
 				goto next;
@@ -2169,11 +2157,11 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		if (entry->delta_data && !pack_to_stdout) {
 			unsigned long size;
 
-			size = do_compress(&entry->delta_data, DELTA_SIZE(entry));
+			size = do_compress(&entry->delta_data, oe_delta_size(&to_pack, entry));
 			if (size < (1U << OE_Z_DELTA_BITS)) {
 				entry->z_delta_size = size;
 				cache_lock();
-				delta_cache_size -= DELTA_SIZE(entry);
+				delta_cache_size -= oe_delta_size(&to_pack, entry);
 				delta_cache_size += entry->z_delta_size;
 				cache_unlock();
 			} else {
@@ -2186,7 +2174,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * depth, leaving it in the window is pointless.  we
 		 * should evict it first.
 		 */
-		if (DELTA(entry) && max_depth <= n->depth)
+		if (oe_delta(&to_pack, entry) && max_depth <= n->depth)
 			continue;
 
 		/*
@@ -2194,7 +2182,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * currently deltified object, to keep it longer.  It will
 		 * be the first base object to be attempted next.
 		 */
-		if (DELTA(entry)) {
+		if (oe_delta(&to_pack, entry)) {
 			struct unpacked swap = array[best_base];
 			int dist = (window + idx - best_base) % window;
 			int dst = best_base;
@@ -2515,7 +2503,7 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (DELTA(entry))
+		if (oe_delta(&to_pack, entry))
 			/* This happens if we decided to reuse existing
 			 * delta from a pack.  "reuse_delta &&" is implied.
 			 */
-- 8< --

^ permalink raw reply related	[flat|nested] 273+ messages in thread

* Re: [PATCH v8 00/15] nd/pack-objects-pack-struct updates
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (14 preceding siblings ...)
  2018-03-31 10:03                 ` [PATCH v8 15/15] ci: exercise the whole test suite with uncommon code in pack-objects Nguyễn Thái Ngọc Duy
@ 2018-03-31 11:36                 ` Ævar Arnfjörð Bjarmason
  2018-03-31 12:08                   ` Duy Nguyen
  2018-04-06 21:47                 ` Jeff King
  16 siblings, 1 reply; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-31 11:36 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: e, git, gitster, peff


On Sat, Mar 31 2018, Nguyễn Thái Ngọc Duy wrote:

I'm testing this and it looks good to me so far, aside from this:

> - use git_env_*() instead of manually handling getenv() values
> [...]
>  	struct packed_git **mapping, *p;
> -	int cnt = 0, nr = 1 << OE_IN_PACK_BITS;
> -
> -	if (getenv("GIT_TEST_FULL_IN_PACK_ARRAY")) {
> -		/*
> -		 * leave in_pack_by_idx NULL to force in_pack[] to be
> -		 * used instead
> -		 */
> -		return;
> -	}
> [...]
>
> +	if (git_env_bool("GIT_TEST_FULL_IN_PACK_ARRAY", 0)) {
> +		/*
> +		 * do not initialize in_pack_by_idx[] to force the
> +		 * slow path in oe_in_pack()
> +		 */
> +	} else {
> +		prepare_in_pack_by_idx(pdata);
> +	}
> [...]
> diff --git a/t/README b/t/README
> index 02bfb3fed5..c01d210c15 100644
> --- a/t/README
> +++ b/t/README
> @@ -291,16 +291,26 @@ expect the rest to function correctly.
>  and know what setup is needed for it.  Or when you want to run
>  everything up to a certain test.
>
> +
> +Running tests with special setups
> +---------------------------------
> +
> +The whole test suite could be run to test some special features
> +that cannot be easily covered by a few specific test cases. These
> +could be enabled by running the test suite with correct GIT_TEST_
> +environment set.
> +
> +GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
> +
>  GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
>  path where there are more than 1024 packs even if the actual number of
>  packs in repository is below this limit.
>
> -GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
> -code path where we do not cache objecct size in memory and read it
> -from existing packs on demand. This normally only happens when the
> -object size is over 2GB. This variable forces the code path on any
> -object larger than 2^<bits> bytes.

The docs here say to set these env variables, but actually
GIT_TEST_FULL_IN_PACK_ARRAY is a special snowflake in requiring you to
set it to a bool value.

I'd set GIT_TEST_SPLIT_INDEX=YesPlease already in my test setup & just
copied that as GIT_TEST_FULL_IN_PACK_ARRAY=YesPlease, but that'll error
out since it's expecting a bool value, not merely for the env variable
to be set.

I really don't care which we use, but let's use either if(getenv()) or
if(git_env_bool()) consistently, and then have the docs either say "if
set" or "if set to a boolean value (see git-config(1))".

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v8 00/15] nd/pack-objects-pack-struct updates
  2018-03-31 11:36                 ` [PATCH v8 00/15] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
@ 2018-03-31 12:08                   ` Duy Nguyen
  2018-03-31 15:43                     ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 273+ messages in thread
From: Duy Nguyen @ 2018-03-31 12:08 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Eric Wong, Git Mailing List, Junio C Hamano, Jeff King

On Sat, Mar 31, 2018 at 1:36 PM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>> +GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
>> +
>>  GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
>>  path where there are more than 1024 packs even if the actual number of
>>  packs in repository is below this limit.
>>
>> -GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
>> -code path where we do not cache objecct size in memory and read it
>> -from existing packs on demand. This normally only happens when the
>> -object size is over 2GB. This variable forces the code path on any
>> -object larger than 2^<bits> bytes.
>
> The docs here say set these env variables, but actually
> GIT_TEST_FULL_IN_PACK_ARRAY is a special snowflake in requiring you to
> set a bool value.
>
> I'd set GIT_TEST_SPLIT_INDEX=YesPlease already in my test setup & just
> copied that as GIT_TEST_FULL_IN_PACK_ARRAY=YesPlease, but that'll error
> out since it's expecting bool, not the env variable to be set.
>
> I really don't care which we use, but let's use either if(getenv()) or
> if(git_env_bool()) consistently, and then have the docs either say "if
> set" or "if set to a boolean value (see git-config(1))".

I'll change GIT_TEST_SPLIT_INDEX to boolean too since I document it
here anyway. Will wait for a while though to see if anything else
should be part of v9.
-- 
Duy

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v8 00/15] nd/pack-objects-pack-struct updates
  2018-03-31 12:08                   ` Duy Nguyen
@ 2018-03-31 15:43                     ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 273+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2018-03-31 15:43 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Eric Wong, Git Mailing List, Junio C Hamano, Jeff King


On Sat, Mar 31 2018, Duy Nguyen wrote:

> On Sat, Mar 31, 2018 at 1:36 PM, Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>> +GIT_TEST_SPLIT_INDEX forces split-index mode on the whole test suite.
>>> +
>>>  GIT_TEST_FULL_IN_PACK_ARRAY exercises the uncommon pack-objects code
>>>  path where there are more than 1024 packs even if the actual number of
>>>  packs in repository is below this limit.
>>>
>>> -GIT_TEST_OE_SIZE_BITS=<bits> exercises the uncommon pack-objects
>>> -code path where we do not cache objecct size in memory and read it
>>> -from existing packs on demand. This normally only happens when the
>>> -object size is over 2GB. This variable forces the code path on any
>>> -object larger than 2^<bits> bytes.
>>
>> The docs here say set these env variables, but actually
>> GIT_TEST_FULL_IN_PACK_ARRAY is a special snowflake in requiring you to
>> set a bool value.
>>
>> I'd set GIT_TEST_SPLIT_INDEX=YesPlease already in my test setup & just
>> copied that as GIT_TEST_FULL_IN_PACK_ARRAY=YesPlease, but that'll error
>> out since it's expecting bool, not the env variable to be set.
>>
>> I really don't care which we use, but let's use either if(getenv()) or
>> if(git_env_bool()) consistently, and then have the docs either say "if
>> set" or "if set to a boolean value (see git-config(1))".
>
> I'll change GIT_TEST_SPLIT_INDEX to boolean too since I document it
> here anyway. Will wait for a while though to see if anything else
> should be part of v9.

Sounds good. FWIW (since I spied your forced push to your private branch
on GitHub), I mean something like this on top of what you just pushed:

    diff --git a/t/README b/t/README
    index 65dee935c0..583bede192 100644
    --- a/t/README
    +++ b/t/README
    @@ -298,7 +298,8 @@ Running tests with special setups
     The whole test suite could be run to test some special features
     that cannot be easily covered by a few specific test cases. These
     could be enabled by running the test suite with correct GIT_TEST_
    -environment set.
    +environment variable set to a boolean value, as documented in the
    +"Values" section of git-config(1).

     GIT_TEST_SPLIT_INDEX=<true|false> forces split-index mode on the whole
     test suite.

I.e. the part above where we just say it has to be set should be changed
to indicate it's a boolean as understood by git, since in shell/*nix
idiom saying something has to be set just means ensuring that getenv()
won't return NULL.

^ permalink raw reply	[flat|nested] 273+ messages in thread

* Re: [PATCH v8 00/15] nd/pack-objects-pack-struct updates
  2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
                                   ` (15 preceding siblings ...)
  2018-03-31 11:36                 ` [PATCH v8 00/15] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
@ 2018-04-06 21:47                 ` Jeff King
  16 siblings, 0 replies; 273+ messages in thread
From: Jeff King @ 2018-04-06 21:47 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: avarab, e, git, gitster

On Sat, Mar 31, 2018 at 12:02:56PM +0200, Nguyễn Thái Ngọc Duy wrote:

> v8 changes
> 
> - prefer BUG() over die()
> - do "1U <<" instead of "1 << " to avoid undefined behavior with
>   signed shifting.
> - add more comments based on Jeff's feedback
> - plug a leak in try_delta() when delta_size is too large
> - be kind and set depth/cache_max_small_delta_size to max limit
>   instead of dying when the user gives a value over limit
> - make travis execute pack-objects uncommon code
> - use git_env_*() instead of manually handling getenv() values
> - fallback code for when a new pack is added when pack-objects is
>   running
> - Compressed cached delta size limit is increased from 64k to 1MB
> - Cached delta size limit is decreased from 2G to 1MB

I ran out of time to give this a very careful review, and I'm trying to
clear my plate before going offline for a few weeks. I did look at the
interdiff and it seemed sane.

So I think going forward, you can at least consider my objections
retracted. I don't promise this iteration is bug-free, but I think
you've addressed all my earlier issues. :)

Thanks.

-Peff

^ permalink raw reply	[flat|nested] 273+ messages in thread

end of thread, other threads:[~2018-04-06 21:47 UTC | newest]

Thread overview: 273+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-28  9:27 Reduce pack-objects memory footprint? Duy Nguyen
2018-02-28 10:17 ` Jeff King
2018-02-28 10:58   ` Duy Nguyen
2018-02-28 11:11     ` Jeff King
2018-02-28 11:24       ` Duy Nguyen
2018-02-28 18:22 ` Eric Wong
2018-03-01  9:00   ` Duy Nguyen
2018-03-01  9:10 ` [PATCH 00/11] Reduce pack-objects memory footprint Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 01/11] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-01 18:00     ` Junio C Hamano
2018-03-01  9:10   ` [PATCH 05/11] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 06/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 07/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-01 12:37     ` Ævar Arnfjörð Bjarmason
2018-03-01 14:49     ` Jeff King
2018-03-02  0:02       ` Duy Nguyen
2018-03-01 18:05     ` Junio C Hamano
2018-03-01  9:10   ` [PATCH 08/11] pack-objects: faster reverse packed_git lookup Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 09/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-01 18:08     ` Junio C Hamano
2018-03-01  9:10   ` [PATCH 10/11] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
2018-03-01  9:10   ` [PATCH 11/11] pack-objects: increase pack file limit to 4096 Nguyễn Thái Ngọc Duy
2018-03-01 13:33   ` [PATCH 00/11] Reduce pack-objects memory footprint Ævar Arnfjörð Bjarmason
2018-03-02  0:14     ` Duy Nguyen
2018-03-02 10:57       ` Jeff King
2018-03-03  2:46   ` [PATCH/RFC v2 0/9] " Nguyễn Thái Ngọc Duy
2018-03-03  2:46     ` [PATCH/RFC v2 1/9] pack-objects: document holes in struct object_entry.h Nguyễn Thái Ngọc Duy
2018-03-03  2:46     ` [PATCH/RFC v2 2/9] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 3/9] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 4/9] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 5/9] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 6/9] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 7/9] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 8/9] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-03  2:47     ` [PATCH/RFC v2 9/9] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
2018-03-05  9:28     ` [PATCH/RFC v2 0/9] Reduce pack-objects memory footprint Duy Nguyen
2018-03-08 11:42     ` [PATCH/RFC v3 00/12] " Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 01/12] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
2018-03-09 22:34         ` Junio C Hamano
2018-03-08 11:42       ` [PATCH/RFC v3 02/12] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-09 22:54         ` Junio C Hamano
2018-03-12 17:51           ` Duy Nguyen
2018-03-08 11:42       ` [PATCH/RFC v3 03/12] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 04/12] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-09 23:07         ` Junio C Hamano
2018-03-08 11:42       ` [PATCH/RFC v3 05/12] pack-objects: note about in_pack_header_size Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 06/12] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 07/12] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-09 23:21         ` Junio C Hamano
2018-03-08 11:42       ` [PATCH/RFC v3 08/12] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-14 16:18         ` Junio C Hamano
2018-03-08 11:42       ` [PATCH/RFC v3 09/12] pack-objects: reorder 'hash' to pack struct object_entry Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 10/12] pack-objects: shrink z_delta_size field in " Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 11/12] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
2018-03-08 11:42       ` [PATCH/RFC v3 12/12] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
2018-03-16 18:31       ` [PATCH v4 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
2018-03-16 18:31         ` [PATCH v4 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
2018-03-16 20:32           ` Junio C Hamano
2018-03-17 11:59             ` Duy Nguyen
2018-03-16 18:31         ` [PATCH v4 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-16 20:49           ` Junio C Hamano
2018-03-16 18:31         ` [PATCH v4 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-16 18:31         ` [PATCH v4 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-16 18:31         ` [PATCH v4 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-16 18:31         ` [PATCH v4 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-26 20:39           ` Stefan Beller
2018-03-16 18:31         ` [PATCH v4 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-16 20:59           ` Junio C Hamano
2018-03-16 18:31         ` [PATCH v4 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-16 19:40           ` Junio C Hamano
2018-03-16 18:31         ` [PATCH v4 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
2018-03-16 19:49           ` Junio C Hamano
2018-03-16 21:34             ` Junio C Hamano
2018-03-16 18:31         ` [PATCH v4 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
2018-03-16 18:32         ` [PATCH v4 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
2018-03-16 21:02           ` Junio C Hamano
2018-03-17 12:07             ` Duy Nguyen
2018-03-17 14:10         ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-17 21:26             ` Ævar Arnfjörð Bjarmason
2018-03-17 14:10           ` [PATCH v5 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
2018-03-17 19:57             ` Ævar Arnfjörð Bjarmason
2018-03-18  5:09             ` Junio C Hamano
2018-03-18  8:23               ` Duy Nguyen
2018-03-17 14:10           ` [PATCH v5 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
2018-03-17 14:10           ` [PATCH v5 11/11] pack-objects.h: reorder members to shrink " Nguyễn Thái Ngọc Duy
2018-03-17 19:53             ` Ævar Arnfjörð Bjarmason
2018-03-18  8:49               ` Duy Nguyen
2018-03-17 19:45           ` [PATCH v5 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
2018-03-17 19:47             ` Ævar Arnfjörð Bjarmason
2018-03-18 14:25           ` [PATCH v6 " Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 01/11] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 02/11] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 03/11] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 04/11] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 05/11] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 06/11] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 07/11] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 08/11] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 09/11] pack-objects: shrink size " Nguyễn Thái Ngọc Duy
2018-03-18 14:49               ` Ævar Arnfjörð Bjarmason
2018-03-19 16:19               ` Junio C Hamano
2018-03-19 16:23                 ` Duy Nguyen
2018-03-19 16:43               ` Junio C Hamano
2018-03-19 16:54                 ` Duy Nguyen
2018-03-19 18:29                   ` Junio C Hamano
2018-03-19 18:45                     ` Duy Nguyen
2018-03-19 20:10                       ` Junio C Hamano
2018-03-20 18:08                         ` Duy Nguyen
2018-03-20 18:22                           ` Junio C Hamano
2018-03-21  8:03                           ` Jeff King
2018-03-21 16:12                             ` Duy Nguyen
2018-03-20 18:17                 ` Duy Nguyen
2018-03-18 14:25             ` [PATCH v6 10/11] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
2018-03-18 14:25             ` [PATCH v6 11/11] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
2018-03-18 14:51             ` [PATCH v6 00/11] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
2018-03-21  8:24             ` Jeff King
2018-03-21 15:59               ` Duy Nguyen
2018-03-21 16:17                 ` Ævar Arnfjörð Bjarmason
2018-03-21 16:22                   ` Duy Nguyen
2018-03-21 16:46                 ` Duy Nguyen
2018-03-21 19:11                   ` Junio C Hamano
2018-03-22  9:32                 ` Jeff King
2018-03-22  9:46                   ` Jeff King
2018-03-22 10:57                   ` Duy Nguyen
2018-03-22 11:52                     ` Jeff King
2018-03-22 17:04                       ` Duy Nguyen
2018-03-23  1:28                   ` Ramsay Jones
2018-03-23  2:46                     ` Jeff King
2018-03-23  5:50                       ` Jeff King
2018-03-23 16:01                         ` Ramsay Jones
2018-03-24  6:40                           ` Jeff King
2018-03-23  7:05                       ` Duy Nguyen
2018-03-23 14:03                       ` Ramsay Jones
2018-03-21 16:31               ` Ævar Arnfjörð Bjarmason
2018-03-21 16:53                 ` Junio C Hamano
2018-03-21 17:00                   ` Duy Nguyen
2018-03-22  8:07                 ` Jeff King
2018-03-22  8:23                   ` Duy Nguyen
2018-03-22 10:01                     ` Jeff King
2018-03-24  6:33             ` [PATCH v7 00/13] " Nguyễn Thái Ngọc Duy
2018-03-24  6:33               ` [PATCH v7 01/13] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
2018-03-24  6:33               ` [PATCH v7 02/13] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-30 20:18                 ` Jeff King
2018-03-24  6:33               ` [PATCH v7 03/13] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-30 20:23                 ` Jeff King
2018-03-24  6:33               ` [PATCH v7 04/13] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-30 20:26                 ` Jeff King
2018-03-24  6:33               ` [PATCH v7 05/13] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-30 20:30                 ` Jeff King
2018-03-24  6:33               ` [PATCH v7 06/13] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-24  9:42                 ` Ævar Arnfjörð Bjarmason
2018-03-24 12:26                   ` Duy Nguyen
2018-03-24 12:13                 ` Ævar Arnfjörð Bjarmason
2018-03-30 20:48                 ` Jeff King
2018-03-31  4:51                   ` Duy Nguyen
2018-03-31 10:20                     ` Jeff King
2018-03-31 10:45                       ` Duy Nguyen
2018-03-24  6:33               ` [PATCH v7 07/13] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-30 20:53                 ` Jeff King
2018-03-24  6:33               ` [PATCH v7 08/13] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-30 20:59                 ` Jeff King
2018-03-31  4:40                   ` Duy Nguyen
2018-03-31 10:17                     ` Jeff King
2018-03-24  6:33               ` [PATCH v7 09/13] pack-objects: don't check size when the object is bad Nguyễn Thái Ngọc Duy
2018-03-24  6:33               ` [PATCH v7 10/13] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
2018-03-30 21:04                 ` Jeff King
2018-03-31  4:35                   ` Duy Nguyen
2018-03-31 10:13                     ` Jeff King
2018-03-24  6:33               ` [PATCH v7 11/13] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-30 21:18                 ` Jeff King
2018-03-24  6:33               ` [PATCH v7 12/13] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
2018-03-30 21:24                 ` Jeff King
2018-03-31  4:21                   ` Duy Nguyen
2018-03-31  9:10                   ` Duy Nguyen
2018-03-24  6:33               ` [PATCH v7 13/13] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
2018-03-30 21:26                 ` Jeff King
2018-03-31  4:10                   ` Duy Nguyen
2018-03-26 15:13               ` [PATCH v7 00/13] nd/pack-objects-pack-struct updates Jeff King
2018-03-26 17:04                 ` Duy Nguyen
2018-03-27 16:53                   ` Jeff King
2018-03-31 10:02               ` [PATCH v8 00/15] " Nguyễn Thái Ngọc Duy
2018-03-31 10:02                 ` [PATCH v8 01/15] t/README: mention about running the test suite in special modes Nguyễn Thái Ngọc Duy
2018-03-31 10:02                 ` [PATCH v8 02/15] pack-objects: a bit of document about struct object_entry Nguyễn Thái Ngọc Duy
2018-03-31 10:02                 ` [PATCH v8 03/15] pack-objects: turn type and in_pack_type to bitfields Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 04/15] pack-objects: use bitfield for object_entry::dfs_state Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 05/15] pack-objects: use bitfield for object_entry::depth Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 06/15] pack-objects: move in_pack_pos out of struct object_entry Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 07/15] pack-objects: move in_pack " Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 08/15] pack-objects: refer to delta objects by index instead of pointer Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 09/15] pack-objects: shrink z_delta_size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 10/15] pack-objects: don't check size when the object is bad Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 11/15] pack-objects: clarify the use of object_entry::size Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 12/15] pack-objects: shrink size field in struct object_entry Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 13/15] pack-objects: shrink delta_size " Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 14/15] pack-objects: reorder members to shrink " Nguyễn Thái Ngọc Duy
2018-03-31 10:03                 ` [PATCH v8 15/15] ci: exercise the whole test suite with uncommon code in pack-objects Nguyễn Thái Ngọc Duy
2018-03-31 11:36                 ` [PATCH v8 00/15] nd/pack-objects-pack-struct updates Ævar Arnfjörð Bjarmason
2018-03-31 12:08                   ` Duy Nguyen
2018-03-31 15:43                     ` Ævar Arnfjörð Bjarmason
2018-04-06 21:47                 ` Jeff King
2018-03-01  9:20 ` [PATCH/RFC 0/1] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
2018-03-01  9:20   ` [PATCH/RFC 1/1] gc --auto: exclude the largest giant pack in low-memory config Nguyễn Thái Ngọc Duy
2018-03-01 18:14     ` Junio C Hamano
2018-03-02  0:00       ` Duy Nguyen
2018-03-05 14:00     ` Ævar Arnfjörð Bjarmason
2018-03-06 10:41   ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Nguyễn Thái Ngọc Duy
2018-03-06 10:41     ` [PATCH v2 1/5] fixup! Add a test showing that 'git repack' throws away grafted-away parents Nguyễn Thái Ngọc Duy
2018-03-06 18:01       ` Junio C Hamano
2018-03-06 10:41     ` [PATCH v2 2/5] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
2018-03-06 18:25       ` Junio C Hamano
2018-03-07 10:19         ` Duy Nguyen
2018-03-06 10:41     ` [PATCH v2 3/5] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
2018-03-06 19:19       ` Junio C Hamano
2018-03-07 10:48         ` Duy Nguyen
2018-03-07 18:38           ` Junio C Hamano
2018-03-12 18:56             ` Ævar Arnfjörð Bjarmason
2018-03-12 21:16               ` Junio C Hamano
2018-03-12 22:01                 ` Ævar Arnfjörð Bjarmason
2018-03-15 16:48               ` Duy Nguyen
2018-03-07 10:48       ` Johannes Schindelin
2018-03-07 18:40         ` Junio C Hamano
2018-03-12 19:30       ` Ævar Arnfjörð Bjarmason
2018-03-15 17:00         ` Duy Nguyen
2018-03-15 19:21           ` Ævar Arnfjörð Bjarmason
2018-03-16 17:47             ` Duy Nguyen
2018-03-06 10:41     ` [PATCH v2 4/5] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
2018-03-12 18:32       ` Ævar Arnfjörð Bjarmason
2018-03-16 19:14         ` Duy Nguyen
2018-03-16 20:13           ` Duy Nguyen
2018-03-06 10:41     ` [PATCH v2 5/5] pack-objects: display progress in get_object_details() Nguyễn Thái Ngọc Duy
2018-03-06 17:49     ` [PATCH v2 0/5] Avoid expensive 'repack -ad' in gc --auto Junio C Hamano
2018-03-16 19:27     ` [PATCH v3 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
2018-03-16 19:27       ` [PATCH v3 1/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
2018-03-16 19:27       ` [PATCH v3 2/7] gc: add --keep-base-pack Nguyễn Thái Ngọc Duy
2018-03-16 21:05         ` Ævar Arnfjörð Bjarmason
2018-03-19 17:26           ` Duy Nguyen
2018-03-19 19:04             ` Ævar Arnfjörð Bjarmason
2018-03-16 21:25         ` Ævar Arnfjörð Bjarmason
2018-03-16 19:27       ` [PATCH v3 3/7] gc: detect base packs based on gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
2018-03-16 21:02         ` Ævar Arnfjörð Bjarmason
2018-03-16 19:27       ` [PATCH v3 4/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
2018-03-16 21:14         ` Ævar Arnfjörð Bjarmason
2018-03-16 19:27       ` [PATCH v3 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
2018-03-16 21:10         ` Ævar Arnfjörð Bjarmason
2018-03-16 19:27       ` [PATCH v3 6/7] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
2018-03-16 19:27       ` [PATCH v3 7/7] pack-objects: display progress in get_object_details() Nguyễn Thái Ngọc Duy
2018-03-24  7:25       ` [PATCH v4 0/7] nd/repack-keep-pack updates Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 1/7] t7700: have closing quote of a test at the beginning of line Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 2/7] repack: add --keep-pack option Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 3/7] gc: add --keep-largest-pack option Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 4/7] gc: add gc.bigPackThreshold config Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 5/7] gc: handle a corner case in gc.bigPackThreshold Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 6/7] gc --auto: exclude base pack if not enough mem to "repack -ad" Nguyễn Thái Ngọc Duy
2018-03-24  7:25         ` [PATCH v4 7/7] pack-objects: show some progress when counting kept objects Nguyễn Thái Ngọc Duy
2018-03-02 10:18 ` Reduce pack-objects memory footprint? Duy Nguyen
2018-03-02 10:37   ` Eric Wong
2018-03-02 10:54   ` Jeff King
2018-03-02 10:55     ` Duy Nguyen
2018-03-02 14:38     ` Duy Nguyen
2018-03-17 22:05 ` Why does pack-objects use so much memory on incremental packing? Ævar Arnfjörð Bjarmason
2018-03-18  8:37   ` Duy Nguyen
2018-03-20  5:28   ` Jeff King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.