* [PATCH/RFC] reverse the pack-objects delta window logic
@ 2006-04-26  3:37 Nicolas Pitre
  2006-04-26  5:45 ` Junio C Hamano
  0 siblings, 1 reply; 9+ messages in thread
From: Nicolas Pitre @ 2006-04-26  3:37 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

This allows for keeping a single delta index constant while delta 
targets are tested against the same base object.

Signed-off-by: Nicolas Pitre <nico@cam.org>

---

Note, this is an RFC particularly to Junio since the resulting pack is 
larger than without the patch with git-repack -a -f.  However using a 
subsequent git-repack -a brings the pack size down to the expected size.  So 
I'm not sure I've got everything right.

diff --git a/pack-objects.c b/pack-objects.c
index c0acc46..33027a8 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -19,19 +19,17 @@ struct object_entry {
 	unsigned long offset;	/* offset into the final pack file;
 				 * nonzero if already written.
 				 */
-	unsigned int depth;	/* delta depth */
-	unsigned int delta_limit;	/* base adjustment for in-pack delta */
+	unsigned int delta_limit;	/* deepest delta from this object */
 	unsigned int hash;	/* name hint hash */
 	enum object_type type;
 	enum object_type in_pack_type;	/* could be delta */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	struct object_entry *delta;	/* delta base object */
-	struct packed_git *in_pack; 	/* already in pack */
-	unsigned int in_pack_offset;
 	struct object_entry *delta_child; /* delitified objects who bases me */
 	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+					       uses the same base as me */
+	struct packed_git *in_pack; 	/* already in pack */
+	unsigned int in_pack_offset;
 	int preferred_base;	/* we do not pack this, but is encouraged to
 				 * be used as the base objectto delta huge
 				 * objects against.
@@ -906,11 +904,11 @@ static void get_object_details(void)
 	for (i = 0, entry = objects; i < nr_objects; i++, entry++)
 		check_object(entry);
 
-	if (nr_objects == nr_result) {
+	if (!no_reuse_delta && nr_objects == nr_result) {
 		/*
-		 * Depth of objects that depend on the entry -- this
-		 * is subtracted from depth-max to break too deep
-		 * delta chain because of delta data reusing.
+		 * We must determine the maximum depth of reused deltas
+		 * for those objects used as their base before find_deltas()
+		 * starts considering them as potential delta targets.
 		 * However, we loosen this restriction when we know we
 		 * are creating a thin pack -- it will have to be
 		 * expanded on the other end anyway, so do not
@@ -1004,64 +1002,78 @@ struct unpacked {
  * more importantly, the bigger file is likely the more recent
  * one.
  */
-static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
+static int try_delta(struct unpacked *trg, struct unpacked *src,
+		     struct delta_index *src_index, unsigned max_depth)
 {
-	struct object_entry *cur_entry = cur->entry;
-	struct object_entry *old_entry = old->entry;
-	unsigned long size, oldsize, delta_size, sizediff;
-	long max_size;
+	struct object_entry *trg_entry = trg->entry;
+	struct object_entry *src_entry = src->entry;
+	unsigned long size, src_size, delta_size, sizediff, max_size;
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (cur_entry->type != old_entry->type)
+	if (trg_entry->type != src_entry->type)
 		return -1;
 
 	/* We do not compute delta to *create* objects we are not
 	 * going to pack.
 	 */
-	if (cur_entry->preferred_base)
-		return -1;
+	if (trg_entry->preferred_base)
+		return 0;
 
-	/* If the current object is at pack edge, take the depth the
-	 * objects that depend on the current object into account --
-	 * otherwise they would become too deep.
+	/*
+	 * Make sure deltifying this object won't make its deepest delta
+	 * too deep, but only when not producing a thin pack.
 	 */
-	if (cur_entry->delta_child) {
-		if (max_depth <= cur_entry->delta_limit)
-			return 0;
-		max_depth -= cur_entry->delta_limit;
-	}
-
-	size = cur_entry->size;
-	oldsize = old_entry->size;
-	sizediff = oldsize > size ? oldsize - size : size - oldsize;
+	if (nr_objects == nr_result && trg_entry->delta_limit >= max_depth)
+		return 0;
 
+	/* Now some size filtering euristics. */
+	size = trg_entry->size;
 	if (size < 50)
-		return -1;
-	if (old_entry->depth >= max_depth)
 		return 0;
-
-	/*
-	 * NOTE!
-	 *
-	 * We always delta from the bigger to the smaller, since that's
-	 * more space-efficient (deletes don't have to say _what_ they
-	 * delete).
-	 */
 	max_size = size / 2 - 20;
-	if (cur_entry->delta)
-		max_size = cur_entry->delta_size-1;
+	if (trg_entry->delta)
+		max_size = trg_entry->delta_size-1;
+	src_size = src_entry->size;
+	sizediff = src_size < size ? size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
-	delta_buf = diff_delta(old->data, oldsize,
-			       cur->data, size, &delta_size, max_size);
+
+	delta_buf = create_delta(src_index, trg->data, size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	cur_entry->delta = old_entry;
-	cur_entry->delta_size = delta_size;
-	cur_entry->depth = old_entry->depth + 1;
+
+	if (trg_entry->delta) {
+		/*
+		 * The target object already has a delta base but we just
+		 * found a better one.  Remove it from its former base
+		 * childhood and redetermine the base delta_limit (if used).
+		 */
+		struct object_entry *base = trg_entry->delta;
+		struct object_entry **child_link = &base->delta_child;
+		base->delta_limit = 0;
+		while (*child_link) {
+			if (*child_link == trg_entry) {
+				*child_link = trg_entry->delta_sibling;
+				if (nr_objects != nr_result)
+					break;
+				continue;
+			}
+			if (base->delta_limit <= (*child_link)->delta_limit)
+				base->delta_limit =
+					(*child_link)->delta_limit + 1;
+			child_link = &(*child_link)->delta_sibling;
+		}
+	}
+
+	trg_entry->delta = src_entry;
+	trg_entry->delta_size = delta_size;
+	trg_entry->delta_sibling = src_entry->delta_child;
+	src_entry->delta_child = trg_entry;
+	if (src_entry->delta_limit <= trg_entry->delta_limit)
+		src_entry->delta_limit = trg_entry->delta_limit + 1;
 	free(delta_buf);
-	return 0;
+	return 1;
 }
 
 static void progress_interval(int signum)
@@ -1078,14 +1090,15 @@ static void find_deltas(struct object_en
 	unsigned last_percent = 999;
 
 	memset(array, 0, array_size);
-	i = nr_objects;
+	i = 0;
 	idx = 0;
 	if (progress)
 		fprintf(stderr, "Deltifying %d objects.\n", nr_result);
 
-	while (--i >= 0) {
-		struct object_entry *entry = list[i];
+	while (i < nr_objects) {
+		struct object_entry *entry = list[i++];
 		struct unpacked *n = array + idx;
+		struct delta_index *delta_index;
 		unsigned long size;
 		char type[10];
 		int j;
@@ -1113,7 +1126,13 @@ static void find_deltas(struct object_en
 		n->entry = entry;
 		n->data = read_sha1_file(entry->sha1, type, &size);
 		if (size != entry->size)
-			die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
+			die("object %s inconsistent object length (%lu vs %lu)",
+			    sha1_to_hex(entry->sha1), size, entry->size);
+		if (!size)
+			continue;
+		delta_index = create_delta_index(n->data, size);
+		if (!delta_index)
+			die("out of memory");
 
 		j = window;
 		while (--j > 0) {
@@ -1124,18 +1143,10 @@ static void find_deltas(struct object_en
 			m = array + other_idx;
 			if (!m->entry)
 				break;
-			if (try_delta(n, m, depth) < 0)
+			if (try_delta(m, n, delta_index, depth) < 0)
 				break;
 		}
-#if 0
-		/* if we made n a delta, and if n is already at max
-		 * depth, leaving it in the window is pointless.  we
-		 * should evict it first.
-		 * ... in theory only; somehow this makes things worse.
-		 */
-		if (entry->delta && depth <= entry->depth)
-			continue;
-#endif
+		free_delta_index(delta_index);
 		idx++;
 		if (idx >= window)
 			idx = 0;


* Re: [PATCH/RFC] reverse the pack-objects delta window logic
  2006-04-26  3:37 [PATCH/RFC] reverse the pack-objects delta window logic Nicolas Pitre
@ 2006-04-26  5:45 ` Junio C Hamano
  2006-04-26 15:48   ` Nicolas Pitre
  2006-04-26 17:48   ` Nicolas Pitre
  0 siblings, 2 replies; 9+ messages in thread
From: Junio C Hamano @ 2006-04-26  5:45 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

> Note, this is an RFC particularly to Junio since the resulting pack is 
> larger than without the patch with git-repack -a -f.  However using a 
> subsequent git-repack -a brings the pack size down to the expected size.  So 
> I'm not sure I've got everything right.

I haven't tested it seriously yet, but there is nothing that
looks obviously wrong that might cause the inflation problem,
from the cursory look after applying the patch on top of your
last round.

> +	if (nr_objects == nr_result && trg_entry->delta_limit >= max_depth)
> +		return 0;

The older code was loosening this check only for a delta chain
that is already in pack (which is limited to its previous
max_depth).  The end result is almost the same -- a thin pack
recipient would have deeper deltas than it asked for.  The difference
is that the earlier code had an implicit 2*max_depth limit, but
this one makes the chain length unbounded, which I do not think
is necessarily a bad change.  In any case it does not explain
why you are getting a larger resulting pack.

> +	/* Now some size filtering euristics. */
> +	size = trg_entry->size;
>  	if (size < 50)
> -		return -1;
> -	if (old_entry->depth >= max_depth)
>  		return 0;

This is necessary because you are scanning from smaller to
larger, and I think it is a good change.

> -	/*
> -	 * NOTE!
> -	 *
> -	 * We always delta from the bigger to the smaller, since that's
> -	 * more space-efficient (deletes don't have to say _what_ they
> -	 * delete).
> -	 */

This comment by Linus still applies, even though the scan order
is now reversed; no need to remove it.

> +
> +	if (trg_entry->delta) {
> +		/*
> +		 * The target object already has a delta base but we just
> +		 * found a better one.  Remove it from its former base
> +		 * childhood and redetermine the base delta_limit (if used).
> +		 */

And since you are making the delta chain unbounded for the thin case, you
can probably skip this with the same if() here; the
recomputation seems rather expensive.

> +			die("object %s inconsistent object length (%lu vs %lu)",
> +			    sha1_to_hex(entry->sha1), size, entry->size);
> +		if (!size)
> +			continue;
> +		delta_index = create_delta_index(n->data, size);
> +		if (!delta_index)
> +			die("out of memory");

It might be worth saying "if (size < 50)" here as well; no point
wasting the delta window for small sources.

> -#if 0
> -		/* if we made n a delta, and if n is already at max
> -		 * depth, leaving it in the window is pointless.  we
> -		 * should evict it first.
> -		 * ... in theory only; somehow this makes things worse.
> -		 */
> -		if (entry->delta && depth <= entry->depth)
> -			continue;
> -#endif

I was almost tempted to suggest that the degradation you are
seeing might be related to this mystery I did not get around to
solving.  By giving less optimal candidates a chance to serve as
delta bases, it appeared that we ended up making the
final pack size bigger than otherwise, which suggests that our
choice between plain undeltified and a delta half its size might
be favoring delta too much.  But it does not appear to be
related to the inflation you are seeing.

With the object list taken between v1.2.3..v1.3.0 in the git.git
repository and without delta reuse, 3054 objects are packed
(delta 1734) with this code.  The "next" version makes 1818 deltas (only
5% more), which makes me suspect that it is making a bad choice
of delta base, because the final pack size is 1.5M vs 1.9M.

The chain length distribution is a bit different (run
"git-verify-pack -v" and look at the end of its output).

The "next" version:

chain length = 1: 257 objects
chain length = 2: 189 objects
chain length = 3: 156 objects
chain length = 4: 149 objects
chain length = 5: 113 objects
chain length = 6: 105 objects
chain length = 7: 105 objects
chain length = 8: 102 objects
chain length = 9: 103 objects
chain length = 10: 539 objects

this version:

chain length = 1: 415 objects
chain length = 2: 333 objects
chain length = 3: 259 objects
chain length = 4: 197 objects
chain length = 5: 155 objects
chain length = 6: 134 objects
chain length = 7: 106 objects
chain length = 8: 69 objects
chain length = 9: 47 objects
chain length = 10: 19 objects

The resulting pack would be faster to access (it has a much
shorter median chain length).

BTW, have you tried it without --no-reuse-pack on an object list
that is not thin?  It appears you are busting the depth limit.

Using the same "git rev-list --objects v1.2.3..v1.3.0" as input,
git-pack-objects without --no-reuse-pack gives this
distribution:

chain length = 1: 364 objects
chain length = 2: 269 objects
chain length = 3: 198 objects
chain length = 4: 164 objects
chain length = 5: 148 objects
chain length = 6: 123 objects
chain length = 7: 122 objects
chain length = 8: 103 objects
chain length = 9: 92 objects
chain length = 10: 234 objects
chain length = 11: 12 objects
chain length = 12: 1 object
chain length = 13: 2 objects

So it _might_ be that the depth limiting code is subtly broken,
which is causing you to throw away a perfectly good delta base
which in turn results in a bad pack.  The distribution from the
"next" version looks like this:

chain length = 1: 358 objects
chain length = 2: 250 objects
chain length = 3: 214 objects
chain length = 4: 169 objects
chain length = 5: 150 objects
chain length = 6: 122 objects
chain length = 7: 126 objects
chain length = 8: 100 objects
chain length = 9: 101 objects
chain length = 10: 232 objects


-- >8 --

Summary of the experiment.

# test dataset
git rev-list --objects v1.2.3..v1.3.0 >RL-1.2.3--1.3.0

# baseline: "next" version is what is on my $PATH
git-pack-objects --no-reuse-delta test-next-pack-nr <RL-1.2.3--1.3.0
git-verify-pack -v test-next-pack-nr-*.pack | tail -n 20
git-pack-objects test-next-pack <RL-1.2.3--1.3.0
git-verify-pack -v test-next-pack-*.pack | tail -n 20

# freshly compiled version with the patch in question
./git-pack-objects --no-reuse-delta test-nico-pack-nr <RL-1.2.3--1.3.0
git-verify-pack -v test-nico-pack-nr-*.pack | tail -n 20
./git-pack-objects test-nico-pack <RL-1.2.3--1.3.0
git-verify-pack -v test-nico-pack-*.pack | tail -n 20


* Re: [PATCH/RFC] reverse the pack-objects delta window logic
  2006-04-26  5:45 ` Junio C Hamano
@ 2006-04-26 15:48   ` Nicolas Pitre
  2006-04-26 17:48   ` Nicolas Pitre
  1 sibling, 0 replies; 9+ messages in thread
From: Nicolas Pitre @ 2006-04-26 15:48 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 25 Apr 2006, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> > Note, this is an RFC particularly to Junio since the resulting pack is 
> > larger than without the patch with git-repack -a -f.  However using a 
> > subsequent git-repack -a brings the pack size down to the expected size.  So 
> > I'm not sure I've got everything right.
> 
> I haven't tested it seriously yet, but there is nothing that
> looks obviously wrong that might cause the inflation problem,
> from the cursory look after applying the patch on top of your
> last round.
> 
> > +	if (nr_objects == nr_result && trg_entry->delta_limit >= max_depth)
> > +		return 0;
> 
> The older code was loosening this check only for a delta chain
> that is already in pack (which is limited to its previous
> max_depth).  The end result is almost the same -- a thin pack
> recipient would have deeper deltas than it asked for.  The difference
> is that the earlier code had an implicit 2*max_depth limit,

Ah.  Indeed.  Didn't realize that.  I can restore that behavior quite 
easily if necessary.

> but this one makes the chain length unbounded, which I do not think
> is necessarily a bad change.

Well, as long as the thin pack doesn't carry too many revisions it should 
be fine since, as the comment in the code says, those packs are always 
unpacked.

Initially I had a bug where the delta depth was completely ignored.  I 
was pretty excited when repacking the kernel produced a pack 20% smaller, 
although I didn't know why at that time.  But when attempting another 
git-repack -a -f the initial object counting was sooooooo 
slooooooooow.

> > -	/*
> > -	 * NOTE!
> > -	 *
> > -	 * We always delta from the bigger to the smaller, since that's
> > -	 * more space-efficient (deletes don't have to say _what_ they
> > -	 * delete).
> > -	 */
> 
> This comment by Linus still applies, even though the scan order
> is now reversed; no need to remove it.

This is not exactly true.  In general it is so, but since we fixed the 
deltification of objects with the same name in different directories 
it is quite possible to go from smaller to larger, and leaving that 
comment there is misleading.

This is also why I changed the sizediff rule such that:

	sizediff = src_size < size ? size - src_size : 0;

Since the src buffer already has its delta index computed, it costs 
almost nothing to attempt matching much smaller objects against it.  
However if we go from small to large the previous logic still 
applies.
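
To make the asymmetry concrete, the filter boils down to something like
this (an illustrative stand-alone sketch, not code lifted verbatim from
the patch):

	/*
	 * Growth of the target beyond max_size would have to be emitted
	 * as literal data in the delta, so such a pair cannot possibly
	 * produce a small enough result.  A source that is merely larger
	 * than the target is not penalized: its index is already there.
	 */
	static int size_filter_ok(unsigned long src_size,
				  unsigned long trg_size,
				  unsigned long max_size)
	{
		unsigned long sizediff;

		sizediff = src_size < trg_size ? trg_size - src_size : 0;
		return sizediff < max_size;
	}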

> > +	if (trg_entry->delta) {
> > +		/*
> > +		 * The target object already has a delta base but we just
> > +		 * found a better one.  Remove it from its former base
> > +		 * childhood and redetermine the base delta_limit (if used).
> > +		 */
> 
> And since you are making the delta chain unbounded for the thin case, you
> can probably skip this with the same if() here; the
> recomputation seems rather expensive.

Ah right.  I was doing so partly, but I can skip any tree maintenance 
altogether in that case as well.

> > +		if (!size)
> > +			continue;
> > +		delta_index = create_delta_index(n->data, size);
> > +		if (!delta_index)
> > +			die("out of memory");
> 
> It might be worth saying "if (size < 50)" here as well; no point
> wasting the delta window for small sources.

Good point.  No real effect on the pack size though.

> > -#if 0
> > -		/* if we made n a delta, and if n is already at max
> > -		 * depth, leaving it in the window is pointless.  we
> > -		 * should evict it first.
> > -		 * ... in theory only; somehow this makes things worse.
> > -		 */
> > -		if (entry->delta && depth <= entry->depth)
> > -			continue;
> > -#endif
> 
> I was almost tempted to suggest that the degradation you are
> seeing might be related to this mystery I did not get around to
> solving.  By giving less optimal candidates a chance to serve as
> delta bases, it appeared that we ended up making the
> final pack size bigger than otherwise, which suggests that our
> choice between plain undeltified and a delta half its size might
> be favoring delta too much.  But it does not appear to be
> related to the inflation you are seeing.

Certainly not, since git-repack -a may only delta _more_ and the pack 
size actually goes down a lot in my case.

The mystery I'm facing is why a second pass with git-repack -a would fix 
things.  It has a different window behavior since objects already 
deltified do not occupy window space.  Hmmm.  That would certainly 
explain why doing a git-repack -a after a git-repack -a -f produces a 
smaller pack even currently.

> BTW, have you tried it without --no-reuse-pack on an object list
> that is not thin?  It appears you are busting the depth limit.
> 
> Using the same "git rev-list --objects v1.2.3..v1.3.0" as input,
> git-pack-objects without --no-reuse-pack gives this
> distribution:
> 
> chain length = 1: 364 objects
> chain length = 2: 269 objects
> chain length = 3: 198 objects
> chain length = 4: 164 objects
> chain length = 5: 148 objects
> chain length = 6: 123 objects
> chain length = 7: 122 objects
> chain length = 8: 103 objects
> chain length = 9: 92 objects
> chain length = 10: 234 objects
> chain length = 11: 12 objects
> chain length = 12: 1 object
> chain length = 13: 2 objects

Oops.  OK fixed.

> So it _might_ be that the depth limiting code is subtly broken,
> which is causing you to throw away a perfectly good delta base
> which in turn results in a bad pack.

Actually no.  That bug instead allowed each given base to deltify more 
targets than it should have.


Nicolas


* Re: [PATCH/RFC] reverse the pack-objects delta window logic
  2006-04-26  5:45 ` Junio C Hamano
  2006-04-26 15:48   ` Nicolas Pitre
@ 2006-04-26 17:48   ` Nicolas Pitre
  2006-04-27  3:05     ` Nicolas Pitre
  1 sibling, 1 reply; 9+ messages in thread
From: Nicolas Pitre @ 2006-04-26 17:48 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 25 Apr 2006, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> > Note, this is an RFC particularly to Junio since the resulting pack is 
> > larger than without the patch with git-repack -a -f.  However using a 
> > subsequent git-repack -a brings the pack size down to the expected size.  So 
> > I'm not sure I've got everything right.
> 
> I haven't tested it seriously yet, but there is nothing that
> looks obviously wrong that might cause the inflation problem,
> from the cursory look after applying the patch on top of your
> last round.

Never mind.  I found a flaw in the determination of delta_limit when 
reparenting a delta target.  The immediate parent's delta_limit is 
readjusted when its longest delta is moved to another base, but if that 
parent was itself a delta then the delta_limit adjustment is not 
propagated back up to the top.  This means that some objects were 
falsely credited with too high delta_limit.

And actually I'm not sure how to solve that without walking the tree 
up to the top each time, which I want to avoid as much as possible.
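
Just to make the cost concrete, the straightforward upward walk would
look something like this -- a sketch only, reusing the object_entry
fields from the patch, and exactly the kind of traversal I'd rather
avoid doing on every reparenting:

	static void propagate_delta_limit(struct object_entry *base)
	{
		while (base) {
			unsigned int limit = 0;
			struct object_entry *child = base->delta_child;

			/* recompute from the remaining children */
			while (child) {
				if (limit <= child->delta_limit)
					limit = child->delta_limit + 1;
				child = child->delta_sibling;
			}
			if (base->delta_limit == limit)
				break;	/* nothing changes further up */
			base->delta_limit = limit;
			base = base->delta;	/* walk toward the trunk */
		}
	}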


Nicolas


* Re: [PATCH/RFC] reverse the pack-objects delta window logic
  2006-04-26 17:48   ` Nicolas Pitre
@ 2006-04-27  3:05     ` Nicolas Pitre
  2006-04-27  3:58       ` [PATCH] use delta index data when finding best delta matches Nicolas Pitre
  2006-04-27  5:04       ` [PATCH/RFC] reverse the pack-objects delta window logic Junio C Hamano
  0 siblings, 2 replies; 9+ messages in thread
From: Nicolas Pitre @ 2006-04-27  3:05 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, 26 Apr 2006, Nicolas Pitre wrote:

> On Tue, 25 Apr 2006, Junio C Hamano wrote:
> 
> > Nicolas Pitre <nico@cam.org> writes:
> > 
> > > Note, this is an RFC particularly to Junio since the resulting pack is 
> > > larger than without the patch with git-repack -a -f.  However using a 
> > > subsequent git-repack -a brings the pack size down to the expected size.  So 
> > > I'm not sure I've got everything right.
> > 
> > I haven't tested it seriously yet, but there is nothing that
> > looks obviously wrong that might cause the inflation problem,
> > from the cursory look after applying the patch on top of your
> > last round.
> 
> Never mind.  I found a flaw in the determination of delta_limit when 
> reparenting a delta target.  The immediate parent's delta_limit is 
> readjusted when its longest delta is moved to another base, but if that 
> parent was itself a delta then the delta_limit adjustment is not 
> propagated back up to the top.  This means that some objects were 
> falsely credited with too high delta_limit.
> 
> And actually I'm not sure how to solve that without walking the tree 
> up to the top each time, which I want to avoid as much as possible.

Well, that seems to be unavoidable.

Reversing the window logic isn't that much of a good idea after all.  As 
soon as we need to control the delta depth we sorta need to maintain 
trees of deltas and those trees are built from leaves up to the trunk.  

But each new object in the list is then used as a possible
new base for previously created deltas still in the object window.  When 
the new base is determined to produce a better delta, the 
relationship with the old base must be broken, which means that the 
information about delta length for the old base has to be updated.  And 
if the detached delta chain was the longest one for that old base then the 
remaining longest delta chain from that old base has to be found and 
that information reflected up to the ultimate base in that tree.  And 
doing so isn't necessarily trivial, as can be seen in the patch below.

Then there is the possibility of having a delta "branch" with maximum 
depth, meaning that the trunk for that branch may not be deltified.  But 
if an object that comes later does constitute a better delta base for the 
object in the middle of that branch then the branch will be broken in 
the middle to be transplanted onto the new base as explained previously.  
Which means that the initial trunk no longer has a maximum depth and 
some objects that were skipped because of the depth limit could 
now have been tested, leading to suboptimal delta matching.  This is why 
running pack-objects a second time improved things as it picked up 
those missed delta opportunities.  But having to run pack-objects 
multiple times is a bit against the point, and even if pack-objects 
processed the object list multiple times it certainly won't be 
faster than the current code.

In short, trying to deltify objects by keeping the base object constant 
for the match window really sucks, even from a theoretical point of 
view.  It does produce noticeably larger 
packs and it does take longer to do so.  I'm therefore dropping that 
approach.  My current patch can be found below.  If someone 
smarter than me (there are plenty I'm sure) can come up with improvements 
to it then be my guest.

Therefore I really think the best approach is to simply keep delta index 
data along with object data in the match window and keep the current 
window direction for matching.  It obviously will take up more memory, 
but most probably less than if we set the window size = 20.  And 
building delta trees from the trunk to the leaves will always be optimal 
regardless with no delta rebasing needed.  I'll post another patch doing 
just that.

---

diff --git a/pack-objects.c b/pack-objects.c
index c0acc46..17280fb 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -19,19 +19,17 @@ struct object_entry {
 	unsigned long offset;	/* offset into the final pack file;
 				 * nonzero if already written.
 				 */
-	unsigned int depth;	/* delta depth */
-	unsigned int delta_limit;	/* base adjustment for in-pack delta */
+	unsigned int delta_limit;	/* deepest delta from this object */
 	unsigned int hash;	/* name hint hash */
 	enum object_type type;
 	enum object_type in_pack_type;	/* could be delta */
 	unsigned long delta_size;	/* delta data size (uncompressed) */
 	struct object_entry *delta;	/* delta base object */
-	struct packed_git *in_pack; 	/* already in pack */
-	unsigned int in_pack_offset;
 	struct object_entry *delta_child; /* delitified objects who bases me */
 	struct object_entry *delta_sibling; /* other deltified objects who
-					     * uses the same base as me
-					     */
+					       uses the same base as me */
+	struct packed_git *in_pack; 	/* already in pack */
+	unsigned int in_pack_offset;
 	int preferred_base;	/* we do not pack this, but is encouraged to
 				 * be used as the base objectto delta huge
 				 * objects against.
@@ -884,17 +882,16 @@ static void check_object(struct object_e
 		    sha1_to_hex(entry->sha1), type);
 }
 
-static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
+static unsigned int find_delta_limit(struct object_entry *me)
 {
 	struct object_entry *child = me->delta_child;
-	unsigned int m = n;
 	while (child) {
-		unsigned int c = check_delta_limit(child, n + 1);
-		if (m < c)
-			m = c;
+		unsigned int c = find_delta_limit(child);
+		if (me->delta_limit <= c)
+			me->delta_limit = c + 1;
 		child = child->delta_sibling;
 	}
-	return m;
+	return me->delta_limit;
 }
 
 static void get_object_details(void)
@@ -906,11 +903,11 @@ static void get_object_details(void)
 	for (i = 0, entry = objects; i < nr_objects; i++, entry++)
 		check_object(entry);
 
-	if (nr_objects == nr_result) {
+	if (!no_reuse_delta && nr_objects == nr_result) {
 		/*
-		 * Depth of objects that depend on the entry -- this
-		 * is subtracted from depth-max to break too deep
-		 * delta chain because of delta data reusing.
+		 * We must determine the maximum depth of reused deltas
+		 * for those objects used as their base before find_deltas()
+		 * starts considering them as potential delta targets.
 		 * However, we loosen this restriction when we know we
 		 * are creating a thin pack -- it will have to be
 		 * expanded on the other end anyway, so do not
@@ -919,8 +916,7 @@ static void get_object_details(void)
 		 */
 		for (i = 0, entry = objects; i < nr_objects; i++, entry++)
 			if (!entry->delta && entry->delta_child)
-				entry->delta_limit =
-					check_delta_limit(entry, 1);
+				find_delta_limit(entry);
 	}
 }
 
@@ -994,6 +990,7 @@ static int type_size_sort(const struct o
 struct unpacked {
 	struct object_entry *entry;
 	void *data;
+	int pos;
 };
 
 /*
@@ -1004,64 +1001,94 @@ struct unpacked {
  * more importantly, the bigger file is likely the more recent
  * one.
  */
-static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
+static int try_delta(struct unpacked *trg, struct unpacked *src,
+		     struct delta_index *src_index, unsigned max_depth)
 {
-	struct object_entry *cur_entry = cur->entry;
-	struct object_entry *old_entry = old->entry;
-	unsigned long size, oldsize, delta_size, sizediff;
-	long max_size;
+	struct object_entry *trg_entry = trg->entry;
+	struct object_entry *src_entry = src->entry;
+	unsigned long size, src_size, delta_size, sizediff, max_size;
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (cur_entry->type != old_entry->type)
+	if (trg_entry->type != src_entry->type)
 		return -1;
 
 	/* We do not compute delta to *create* objects we are not
 	 * going to pack.
 	 */
-	if (cur_entry->preferred_base)
-		return -1;
+	if (trg_entry->preferred_base)
+		return 0;
 
-	/* If the current object is at pack edge, take the depth the
-	 * objects that depend on the current object into account --
-	 * otherwise they would become too deep.
+	/*
+	 * Make sure deltifying this object won't make its deepest delta
+	 * too deep, but only when not producing a thin pack.
 	 */
-	if (cur_entry->delta_child) {
-		if (max_depth <= cur_entry->delta_limit)
-			return 0;
-		max_depth -= cur_entry->delta_limit;
-	}
-
-	size = cur_entry->size;
-	oldsize = old_entry->size;
-	sizediff = oldsize > size ? oldsize - size : size - oldsize;
+	if (trg_entry->delta_limit >= max_depth && nr_objects == nr_result)
+		return 0;
 
+	/* Now some size filtering euristics. */
+	size = trg_entry->size;
 	if (size < 50)
-		return -1;
-	if (old_entry->depth >= max_depth)
 		return 0;
-
-	/*
-	 * NOTE!
-	 *
-	 * We always delta from the bigger to the smaller, since that's
-	 * more space-efficient (deletes don't have to say _what_ they
-	 * delete).
-	 */
 	max_size = size / 2 - 20;
-	if (cur_entry->delta)
-		max_size = cur_entry->delta_size-1;
+	if (trg_entry->delta)
+		max_size = trg_entry->delta_size-1;
+	src_size = src_entry->size;
+	sizediff = src_size < size ? size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
-	delta_buf = diff_delta(old->data, oldsize,
-			       cur->data, size, &delta_size, max_size);
+
+	delta_buf = create_delta(src_index, trg->data, size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	cur_entry->delta = old_entry;
-	cur_entry->delta_size = delta_size;
-	cur_entry->depth = old_entry->depth + 1;
+
+	if (trg_entry->delta && nr_objects == nr_result) {
+		/*
+		 * The target object already has a delta base but we just
+		 * found a better one.  Remove it from its former base
+		 * childhood and redetermine the base delta_limit.
+		 * But again, only when not creating a thin pack.
+		 */
+		struct object_entry *base = trg_entry->delta;
+		struct object_entry **child_link = &base->delta_child;
+		unsigned int limit = base->delta_limit;
+		base->delta_limit = 0;
+		while (*child_link) {
+			if (*child_link == trg_entry) {
+				*child_link = trg_entry->delta_sibling;
+				continue;
+			}
+			if (base->delta_limit <= (*child_link)->delta_limit)
+				base->delta_limit =
+					(*child_link)->delta_limit + 1;
+			child_link = &(*child_link)->delta_sibling;
+		}
+		if (base->delta_limit == limit)
+			goto out;
+		while ((base = base->delta) && ++limit == base->delta_limit) {
+			struct object_entry *child = base->delta_child;
+			base->delta_limit = 0;
+			do {
+				if(base->delta_limit <= child->delta_limit) {
+					base->delta_limit =
+						child->delta_limit + 1;
+					if (base->delta_limit == limit)
+						goto out;
+				}
+				child = child->delta_sibling;
+			} while (child);
+		}
+		out:;
+	}
+
+	trg_entry->delta = src_entry;
+	trg_entry->delta_size = delta_size;
+	trg_entry->delta_sibling = src_entry->delta_child;
+	src_entry->delta_child = trg_entry;
+	if (src_entry->delta_limit <= trg_entry->delta_limit)
+		src_entry->delta_limit = trg_entry->delta_limit + 1;
 	free(delta_buf);
-	return 0;
+	return 1;
 }
 
 static void progress_interval(int signum)
@@ -1078,14 +1105,15 @@ static void find_deltas(struct object_en
 	unsigned last_percent = 999;
 
 	memset(array, 0, array_size);
-	i = nr_objects;
+	i = 0;
 	idx = 0;
 	if (progress)
 		fprintf(stderr, "Deltifying %d objects.\n", nr_result);
 
-	while (--i >= 0) {
-		struct object_entry *entry = list[i];
+	while (i < nr_objects) {
+		struct object_entry *entry = list[i++];
 		struct unpacked *n = array + idx;
+		struct delta_index *delta_index;
 		unsigned long size;
 		char type[10];
 		int j;
@@ -1109,11 +1137,18 @@ static void find_deltas(struct object_en
 			 */
 			continue;
 
+		if (entry->size < 50)
+			continue;
 		free(n->data);
+		n->pos = i;
 		n->entry = entry;
 		n->data = read_sha1_file(entry->sha1, type, &size);
 		if (size != entry->size)
-			die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
+			die("object %s inconsistent object length (%lu vs %lu)",
+			    sha1_to_hex(entry->sha1), size, entry->size);
+		delta_index = create_delta_index(n->data, size);
+		if (!delta_index)
+			die("out of memory");
 
 		j = window;
 		while (--j > 0) {
@@ -1124,18 +1159,10 @@ static void find_deltas(struct object_en
 			m = array + other_idx;
 			if (!m->entry)
 				break;
-			if (try_delta(n, m, depth) < 0)
+			if (try_delta(m, n, delta_index, depth) < 0)
 				break;
 		}
-#if 0
-		/* if we made n a delta, and if n is already at max
-		 * depth, leaving it in the window is pointless.  we
-		 * should evict it first.
-		 * ... in theory only; somehow this makes things worse.
-		 */
-		if (entry->delta && depth <= entry->depth)
-			continue;
-#endif
+		free_delta_index(delta_index);
 		idx++;
 		if (idx >= window)
 			idx = 0;


* [PATCH] use delta index data when finding best delta matches
  2006-04-27  3:05     ` Nicolas Pitre
@ 2006-04-27  3:58       ` Nicolas Pitre
  2006-04-28  1:08         ` Junio C Hamano
  2006-04-27  5:04       ` [PATCH/RFC] reverse the pack-objects delta window logic Junio C Hamano
  1 sibling, 1 reply; 9+ messages in thread
From: Nicolas Pitre @ 2006-04-27  3:58 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git


This patch allows for computing the delta index for each base object 
only once and reuse it when trying to find the best delta match.

This should set the mark and pave the way for possibly better delta 
generator algorithms.

Signed-off-by: Nicolas Pitre <nico@cam.org>

---

... as mentioned in my previous post

diff --git a/pack-objects.c b/pack-objects.c
index c0acc46..5b2ef9a 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -994,6 +994,7 @@ static int type_size_sort(const struct o
 struct unpacked {
 	struct object_entry *entry;
 	void *data;
+	struct delta_index *index;
 };
 
 /*
@@ -1004,64 +1005,56 @@ struct unpacked {
  * more importantly, the bigger file is likely the more recent
  * one.
  */
-static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
+static int try_delta(struct unpacked *trg, struct unpacked *src,
+		     struct delta_index *src_index, unsigned max_depth)
 {
-	struct object_entry *cur_entry = cur->entry;
-	struct object_entry *old_entry = old->entry;
-	unsigned long size, oldsize, delta_size, sizediff;
-	long max_size;
+	struct object_entry *trg_entry = trg->entry;
+	struct object_entry *src_entry = src->entry;
+	unsigned long size, src_size, delta_size, sizediff, max_size;
 	void *delta_buf;
 
 	/* Don't bother doing diffs between different types */
-	if (cur_entry->type != old_entry->type)
+	if (trg_entry->type != src_entry->type)
 		return -1;
 
 	/* We do not compute delta to *create* objects we are not
 	 * going to pack.
 	 */
-	if (cur_entry->preferred_base)
+	if (trg_entry->preferred_base)
 		return -1;
 
-	/* If the current object is at pack edge, take the depth the
+	/*
+	 * If the current object is at pack edge, take the depth the
 	 * objects that depend on the current object into account --
 	 * otherwise they would become too deep.
 	 */
-	if (cur_entry->delta_child) {
-		if (max_depth <= cur_entry->delta_limit)
+	if (trg_entry->delta_child) {
+		if (max_depth <= trg_entry->delta_limit)
 			return 0;
-		max_depth -= cur_entry->delta_limit;
+		max_depth -= trg_entry->delta_limit;
 	}
-
-	size = cur_entry->size;
-	oldsize = old_entry->size;
-	sizediff = oldsize > size ? oldsize - size : size - oldsize;
-
-	if (size < 50)
-		return -1;
-	if (old_entry->depth >= max_depth)
+	if (src_entry->depth >= max_depth)
 		return 0;
 
-	/*
-	 * NOTE!
-	 *
-	 * We always delta from the bigger to the smaller, since that's
-	 * more space-efficient (deletes don't have to say _what_ they
-	 * delete).
-	 */
+	/* Now some size filtering euristics. */
+	size = trg_entry->size;
 	max_size = size / 2 - 20;
-	if (cur_entry->delta)
-		max_size = cur_entry->delta_size-1;
+	if (trg_entry->delta)
+		max_size = trg_entry->delta_size-1;
+	src_size = src_entry->size;
+	sizediff = src_size < size ? size - src_size : 0;
 	if (sizediff >= max_size)
 		return 0;
-	delta_buf = diff_delta(old->data, oldsize,
-			       cur->data, size, &delta_size, max_size);
+
+	delta_buf = create_delta(src_index, trg->data, size, &delta_size, max_size);
 	if (!delta_buf)
 		return 0;
-	cur_entry->delta = old_entry;
-	cur_entry->delta_size = delta_size;
-	cur_entry->depth = old_entry->depth + 1;
+
+	trg_entry->delta = src_entry;
+	trg_entry->delta_size = delta_size;
+	trg_entry->depth = src_entry->depth + 1;
 	free(delta_buf);
-	return 0;
+	return 1;
 }
 
 static void progress_interval(int signum)
@@ -1109,11 +1102,19 @@ static void find_deltas(struct object_en
 			 */
 			continue;
 
+		if (entry->size < 50)
+			continue;
+		if (n->index)
+			free_delta_index(n->index);
 		free(n->data);
 		n->entry = entry;
 		n->data = read_sha1_file(entry->sha1, type, &size);
 		if (size != entry->size)
-			die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
+			die("object %s inconsistent object length (%lu vs %lu)",
+			    sha1_to_hex(entry->sha1), size, entry->size);
+		n->index = create_delta_index(n->data, size);
+		if (!n->index)
+			die("out of memory");
 
 		j = window;
 		while (--j > 0) {
@@ -1124,7 +1125,7 @@ static void find_deltas(struct object_en
 			m = array + other_idx;
 			if (!m->entry)
 				break;
-			if (try_delta(n, m, depth) < 0)
+			if (try_delta(n, m, m->index, depth) < 0)
 				break;
 		}
 #if 0
@@ -1144,8 +1145,11 @@ #endif
 	if (progress)
 		fputc('\n', stderr);
 
-	for (i = 0; i < window; ++i)
+	for (i = 0; i < window; ++i) {
+		if (array[i].index)
+			free_delta_index(array[i].index);
 		free(array[i].data);
+	}
 	free(array);
 }
 


* Re: [PATCH/RFC] reverse the pack-objects delta window logic
  2006-04-27  3:05     ` Nicolas Pitre
  2006-04-27  3:58       ` [PATCH] use delta index data when finding best delta matches Nicolas Pitre
@ 2006-04-27  5:04       ` Junio C Hamano
  1 sibling, 0 replies; 9+ messages in thread
From: Junio C Hamano @ 2006-04-27  5:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

>> And actually I'm not sure how to solve that without walking the tree 
>> up to the top each time, which I want to avoid as much as possible.
>
> Well, that seems to be unavoidable.
>
> Reversing the window logic isn't that much of a good idea after all.
> ...

> Then there is the possibility of having a delta "branch" with maximum 
> depth, meaning that the trunk for that branch may not be deltified.  But 
> if an object that comes later does constitute a better delta base for the 
> object in the middle of that branch then the branch will be broken in 
> the middle to be transplanted onto the new base as explained previously.  
> Which means that the initial trunk no longer has a maximum depth and some
> objects that were skipped because of the depth limit could now have been
> tested, leading to suboptimal delta matching.

Good analysis, and I tend to agree with your conclusion.

BTW, Geert mentioned on the #git channel that about half the
filepairs git-pack-objects asks diff_delta() to try have long
sequences of matching bytes at the beginning and at the end.  It
might be worthwhile to take advantage of that, whichever
delta algorithm we end up using.
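
Just to illustrate the kind of shortcut that observation might allow
(a hypothetical helper, not anything that exists in git today): measure
the common head and tail once, emit big copy opcodes for them, and only
run the general algorithm on whatever is left in between.

	static void common_head_tail(const unsigned char *a, unsigned long asize,
				     const unsigned char *b, unsigned long bsize,
				     unsigned long *head, unsigned long *tail)
	{
		unsigned long h = 0, t = 0;
		unsigned long limit = asize < bsize ? asize : bsize;

		while (h < limit && a[h] == b[h])
			h++;
		/* don't let the head and tail regions overlap */
		while (t < limit - h && a[asize - 1 - t] == b[bsize - 1 - t])
			t++;
		*head = h;
		*tail = t;
	}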


* Re: [PATCH] use delta index data when finding best delta matches
  2006-04-27  3:58       ` [PATCH] use delta index data when finding best delta matches Nicolas Pitre
@ 2006-04-28  1:08         ` Junio C Hamano
  2006-04-28  1:56           ` Nicolas Pitre
  0 siblings, 1 reply; 9+ messages in thread
From: Junio C Hamano @ 2006-04-28  1:08 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

> This patch allows for computing the delta index for each base object 
> only once and reuse it when trying to find the best delta match.
>
> This should set the mark and pave the way for possibly better delta 
> generator algorithms.
>
> Signed-off-by: Nicolas Pitre <nico@cam.org>

My understanding is that theoretically this should not make any
difference to the result, and should run faster when the memory
pressure does not cause the machine to thrash.  However,....

I am seeing some differences.  Even with the smallish "git.git"
repository, packing is slightly slower, and the end result is
smaller.

Here are full packing experiments in a fully unpacked git.git
repository.

("next" version)
Total 17724, written 17724 (delta 11779), reused 0 (delta 0)
31.61user 6.24system 0:37.97elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+431995minor)pagefaults 0swaps

 6520520 pack-next-f1fac077a093ffdaf094aab2b7f11859ec0c18f1.pack

(with "use delta index" patch)
Total 17724, written 17724 (delta 12002), reused 0 (delta 0)
33.26user 6.00system 0:39.33elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+434451minor)pagefaults 0swaps

 6188418 pack-nico-f1fac077a093ffdaf094aab2b7f11859ec0c18f1.pack

Not that I am complaining that it produces better results with a
small performance penalty.  I am curious because I do not
understand where the differences are coming from, and I am
reluctant to merge it into "next" until I understand what is going
on.

But I think I know where the differences come from:

-	sizediff = oldsize > size ? oldsize - size : size - oldsize;
+	sizediff = src_size < size ? size - src_size : 0;

There is another "omit smaller than 50" difference but that
should not trigger -- we do not have files that small.

The size-diff change sort-of makes sense -- you are counting how
much the target grew, which you are likely to need to represent
as additions of literal data, and there is no reason to reject
the pairing when a size difference greater than max_size is
in the other direction (deletion).

So, I "backported" that part of the change on top of "next" and
tried the same experiment.

(without "use delta index" but the size heuristics part ported to "next")
Total 17724, written 17724 (delta 12002), reused 0 (delta 0)
36.92user 6.55system 0:43.75elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+431860minor)pagefaults 0swaps

 6188418 pack-size-f1fac077a093ffdaf094aab2b7f11859ec0c18f1.pack

And now the resulting pack is the same as what you produce.

So comparing 31.61 seconds vs 33.26 seconds and complaining you
made it slower is not fair.  You fixed the size heuristic logic
in the current code to produce a 5% smaller pack (which slowed
things down to 36.92 seconds while doing so -- a 15% slowdown),
and then reusing the delta index brought that penalty
down to 5% or so.


-- >8 --

This patch applies on top of "next" to match the size heuristics
used in the "reuse delta index" patch.

 pack-objects.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pack-objects.c b/pack-objects.c
index c0acc46..6604338 100644
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -1032,12 +1032,6 @@ static int try_delta(struct unpacked *cu
 		max_depth -= cur_entry->delta_limit;
 	}
 
-	size = cur_entry->size;
-	oldsize = old_entry->size;
-	sizediff = oldsize > size ? oldsize - size : size - oldsize;
-
-	if (size < 50)
-		return -1;
 	if (old_entry->depth >= max_depth)
 		return 0;
 
@@ -1048,9 +1042,12 @@ static int try_delta(struct unpacked *cu
 	 * more space-efficient (deletes don't have to say _what_ they
 	 * delete).
 	 */
+	size = cur_entry->size;
 	max_size = size / 2 - 20;
 	if (cur_entry->delta)
 		max_size = cur_entry->delta_size-1;
+	oldsize = old_entry->size;
+	sizediff = oldsize < size ? size - oldsize : 0;
 	if (sizediff >= max_size)
 		return 0;
 	delta_buf = diff_delta(old->data, oldsize,
@@ -1109,6 +1106,9 @@ static void find_deltas(struct object_en
 			 */
 			continue;
 
+		if (entry->size < 50)
+			continue;
+
 		free(n->data);
 		n->entry = entry;
 		n->data = read_sha1_file(entry->sha1, type, &size);


* Re: [PATCH] use delta index data when finding best delta matches
  2006-04-28  1:08         ` Junio C Hamano
@ 2006-04-28  1:56           ` Nicolas Pitre
  0 siblings, 0 replies; 9+ messages in thread
From: Nicolas Pitre @ 2006-04-28  1:56 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Thu, 27 Apr 2006, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> > This patch allows for computing the delta index for each base object 
> > only once and reuse it when trying to find the best delta match.
> >
> > This should set the mark and pave the way for possibly better delta 
> > generator algorithms.
> >
> > Signed-off-by: Nicolas Pitre <nico@cam.org>
> 
> My understanding is that theoretically this should not make any
> difference to the result, and should run faster when the memory
> pressure does not cause the machine to thrash.  However,....
> 
> I am seeing some differences.  Even with the smallish "git.git"
> repository, packing is slightly slower, and the end result is
> smaller.

Well, I changed some heuristics a bit.

> Not that I am complaining that it produces better results with a
> small performance penalty.  I am curious because I do not
> understand where the differences are coming from, and I was
> reluctant to merge it in "next" until I understand what is going
> on.
> 
> But I think I know where the differences come from:
> 
> -	sizediff = oldsize > size ? oldsize - size : size - oldsize;
> +	sizediff = src_size < size ? size - src_size : 0;

Right.  The idea is that when the delta source index has to be computed 
each time, if the target buffer is really small then we spend more time 
computing that index than anything else.

But when the delta index is computed only once and already available 
anyway, we don't lose much attempting a delta with a small target buffer, 
since the index computation cost is already paid at that point and the 
actual delta generation will be quick if the target buffer is small.
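
In other words the pattern is simply: build the index once per window
entry, then throw any number of targets at it.  A minimal sketch (a
hypothetical helper, but using the create_delta_index()/create_delta()/
free_delta_index() calls exactly as the patch does):

	static unsigned long best_delta_size(void *src_buf, unsigned long src_size,
					     void **trg, unsigned long *trg_size,
					     int nr_trg, unsigned long max_size)
	{
		struct delta_index *index = create_delta_index(src_buf, src_size);
		unsigned long best = max_size;
		int i;

		if (!index)
			die("out of memory");
		for (i = 0; i < nr_trg; i++) {
			unsigned long delta_size;
			void *delta = create_delta(index, trg[i], trg_size[i],
						   &delta_size, best);
			if (!delta)
				continue;	/* no delta within "best" bytes */
			if (delta_size < best)
				best = delta_size;
			free(delta);
		}
		free_delta_index(index);
		return best;
	}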

> There is another "omit smaller than 50" difference but that
> should not trigger -- we do not have files that small.

Right.  And if such small files show up they won't waste window space.


Nicolas

