* RFC: [PATCH] Support incremental pack files
@ 2007-02-23  7:13 Martin Koegler
  2007-02-23  8:10 ` Junio C Hamano
  2007-02-23 16:04 ` Nicolas Pitre
  0 siblings, 2 replies; 6+ messages in thread
From: Martin Koegler @ 2007-02-23  7:13 UTC (permalink / raw)
  To: git

With CVS (or RCS), committing a version increases the required storage
only by the size of the diff.

Committing a new version in GIT increases the storage by the compressed
size of each changed blob. Packing all unpacked objects decreases the
required storage, but does not generate deltas against objects that are
already in packs. You need to repack all objects to get around this.

For normal source code, this is not a problem.  But if you want to use
git for big files, you waste storage (or CPU time for repacking
everything).

The following patch (against git-1.5.0-rc3) is a prototype for supporting
incremental pack files in git. The file structures are not changed.
It only permits the base object of a delta to be located in a
different pack or as an unpacked object.

Changes: 

* In sha1_file.c, get_delta_base returns offset 0 if the delta base
object is not in the same pack, and additionally stores its sha1 in a
parameter. unpack_delta_entry and packed_delta_info are changed so
that they search for such an object in other packs or as a loose sha1
file. packed_object_info_detail stops searching if the delta base
object is not in the same pack.

* In builtin-pack-objects.c, a loop detector is added (flag
loop_check). When adding an excluded object (= preferred base), its
position in the pack file is now recorded if the object is in a pack.

If a preferred base object is in a pack, check_preferred_object checks
whether it is a delta. If so, it locates the base object and stores it
in ->delta. If the base object was not yet in the object list, it is
added with an exclude value of 2 and is recursively checked in the same
way (check_preferred_object).

try_delta refuses to produce a delta if a loop would be created or if
the base object was added by check_preferred_object.

* mark_edges_unparsed traverses the commit list looking for unparsed
parent commit objects. These are used as a heuristic to provide a list
of suitable base objects for building the deltas.

With the patch, you can e.g. pack all unpacked objects with a command line like
git-pack-objects --non-empty --all --reflog --unpacked --incremental --base-parent

I see the following problems:
* fetching an incremental pack file over HTTP
* reusing an incremental pack file via reuse_cached_pack in builtin-pack-objects.c

Regards, Martin Kögler

--- builtin-pack-objects.c.orig	2007-02-22 22:11:11.287809817 +0100
+++ builtin-pack-objects.c	2007-02-22 22:11:22.694525976 +0100
@@ -16,7 +16,7 @@
 static const char pack_usage[] = "\
 git-pack-objects [{ -q | --progress | --all-progress }] \n\
 	[--local] [--incremental] [--window=N] [--depth=N] \n\
-	[--no-reuse-delta] [--delta-base-offset] [--non-empty] \n\
+	[--no-reuse-delta] [--delta-base-offset] [--non-empty] [--base-parent] \n\
 	[--revs [--unpacked | --all]*] [--reflog] [--stdout | base-name] \n\
 	[<ref-list | <object-list]";
 
@@ -65,6 +65,8 @@
 static int local;
 static int incremental;
 static int allow_ofs_delta;
+static int base_parent;
+static int loop_check;
 
 static struct object_entry **sorted_by_sha, **sorted_by_type;
 static struct object_entry *objects;
@@ -692,6 +694,19 @@
 			}
 		}
 	}
+
+	if (loop_check && exclude) {
+		for (p = packed_git; p; p = p->next) {
+			unsigned long offset = find_pack_entry_one(sha1, p);
+			if (offset) {
+				if (!found_pack) {
+					found_offset = offset;
+					found_pack = p;
+				}
+			}
+		}
+	}
+
 	if ((entry = locate_object_entry(sha1)) != NULL)
 		goto already_added;
 
@@ -722,13 +737,13 @@
 		progress_update = 0;
 	}
 	if (exclude)
-		entry->preferred_base = 1;
-	else {
-		if (found_pack) {
-			entry->in_pack = found_pack;
-			entry->in_pack_offset = found_offset;
-		}
+	    entry->preferred_base = exclude;
+
+	if (found_pack) {
+	    entry->in_pack = found_pack;
+	    entry->in_pack_offset = found_offset;
 	}
+
 	return status;
 }
 
@@ -976,6 +991,78 @@
 	it->pcache.tree_size = size;
 }
 
+static void check_preferred_object(struct object_entry *entry)
+{
+    struct packed_git *p = entry->in_pack;
+    struct pack_window *w_curs = NULL;
+    unsigned long size, used;
+    unsigned char *buf;
+    unsigned long left = p->pack_size - entry->in_pack_offset;
+    struct object_entry *base_entry = NULL;
+    unsigned hash;
+
+    if (!p || !entry->preferred_base)
+	return;
+
+    buf = use_pack(p, &w_curs, entry->in_pack_offset, NULL);
+    
+    used = unpack_object_header_gently(buf, left,
+				       &entry->in_pack_type, &size);
+
+    unsigned char c, *base_name;
+    unsigned long ofs;
+    unsigned long used_0;
+    /* there is at least 20 bytes left in the pack */
+    switch (entry->in_pack_type) {
+	case OBJ_REF_DELTA:
+	    base_name = use_pack(p, &w_curs,
+				 entry->in_pack_offset + used, NULL);
+	    used += 20;
+	    break;
+	case OBJ_OFS_DELTA:
+	    buf = use_pack(p, &w_curs,
+			   entry->in_pack_offset + used, NULL);
+	    used_0 = 0;
+	    c = buf[used_0++];
+	    ofs = c & 127;
+	    while (c & 128) {
+		ofs += 1;
+		if (!ofs || ofs & ~(~0UL >> 7))
+		    die("delta base offset overflow in pack for %s",
+			sha1_to_hex(entry->sha1));
+		c = buf[used_0++];
+		ofs = (ofs << 7) + (c & 127);
+	    }
+	    if (ofs >= entry->in_pack_offset)
+		die("delta base offset out of bound for %s",
+		    sha1_to_hex(entry->sha1));
+	    ofs = entry->in_pack_offset - ofs;
+	    base_name = find_packed_object_name(p, ofs);
+	    used += used_0;
+	    break;
+	default:
+	    base_name = NULL;
+    }
+
+    unuse_pack(&w_curs);
+
+    if (!base_name)
+	return;
+
+    base_entry = locate_object_entry(base_name);
+    if (!base_entry) {
+	hash = name_hash("");
+	add_object_entry(base_name, hash, 2);
+
+	base_entry = locate_object_entry(base_name);
+	check_preferred_object(base_entry);
+    }
+
+    entry->delta = base_entry;
+    entry->delta_sibling = base_entry->delta_child;
+    base_entry->delta_child = entry;
+}
+
 static void check_object(struct object_entry *entry)
 {
 	char type[20];
@@ -1062,6 +1149,9 @@
 		/* Otherwise we would do the usual */
 	}
 
+	if (entry->in_pack && entry->preferred_base) 
+	    check_preferred_object (entry);
+
 	if (sha1_object_info(entry->sha1, type, &entry->size))
 		die("unable to get type of object %s",
 		    sha1_to_hex(entry->sha1));
@@ -1218,6 +1308,8 @@
 	 */
 	if (trg_entry->preferred_base)
 		return -1;
+	if (src_entry->preferred_base == 2)
+		return -1;
 
 	/*
 	 * We do not bother to try a delta that we discarded
@@ -1242,6 +1334,15 @@
 	if (src_entry->depth >= max_depth)
 		return 0;
 
+	if (loop_check) {
+	    struct object_entry *i = src_entry->delta;
+	    while (i) {
+		if (i == trg_entry)
+		    return 0;
+		i = i->delta;
+	    }
+	}
+
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
 	max_size = trg_size/2 - 20;
@@ -1540,6 +1641,8 @@
 
 	prepare_revision_walk(&revs);
 	mark_edges_uninteresting(revs.commits, &revs, show_edge);
+	if (base_parent)
+	    mark_edges_unparsed(revs.commits, &revs, show_edge);
 	traverse_commit_list(&revs, show_commit, show_object);
 }
 
@@ -1609,6 +1712,11 @@
 			no_reuse_delta = 1;
 			continue;
 		}
+		if (!strcmp("--base-parent", arg)) {
+			base_parent = 1;
+			loop_check = 1;
+			continue;
+		}
 		if (!strcmp("--delta-base-offset", arg)) {
 			allow_ofs_delta = 1;
 			continue;
--- list-objects.c.orig	2007-02-22 22:05:13.972754239 +0100
+++ list-objects.c	2007-02-22 22:11:11.254461051 +0100
@@ -66,6 +66,27 @@
 	tree->buffer = NULL;
 }
 
+void mark_edges_unparsed(struct commit_list *list,
+			 struct rev_info *revs,
+			 show_edge_fn show_edge)
+{
+	for ( ; list; list = list->next) {
+		struct commit *commit = list->item;
+		struct commit_list *parents;
+
+		for (parents = commit->parents; parents; parents = parents->next) {
+		    struct commit *parent = parents->item;
+		    if (parent->object.parsed)
+			continue;
+		    if (!(parent->object.flags & SHOWN)) {
+			parent->object.flags |= SHOWN;
+			show_edge(parent);
+		    }
+		}
+	}
+}
+
+
 static void mark_edge_parents_uninteresting(struct commit *commit,
 					    struct rev_info *revs,
 					    show_edge_fn show_edge)
--- list-objects.h.orig	2007-02-22 22:05:10.375119739 +0100
+++ list-objects.h	2007-02-22 22:05:48.095287586 +0100
@@ -8,5 +8,6 @@
 void traverse_commit_list(struct rev_info *revs, show_commit_fn, show_object_fn);
 
 void mark_edges_uninteresting(struct commit_list *, struct rev_info *, show_edge_fn);
+void mark_edges_unparsed(struct commit_list *, struct rev_info *, show_edge_fn);
 
 #endif
--- sha1_file.c.orig	2007-02-22 22:39:46.163797786 +0100
+++ sha1_file.c	2007-02-22 22:52:36.547048788 +0100
@@ -1030,7 +1030,8 @@
 				    unsigned long offset,
 				    enum object_type kind,
 				    unsigned long delta_obj_offset,
-				    unsigned long *base_obj_offset)
+				    unsigned long *base_obj_offset,
+				    unsigned char *base_sha1)
 {
 	unsigned char *base_info = use_pack(p, w_curs, offset, NULL);
 	unsigned long base_offset;
@@ -1059,9 +1060,7 @@
 	} else if (kind == OBJ_REF_DELTA) {
 		/* The base entry _must_ be in the same pack */
 		base_offset = find_pack_entry_one(base_info, p);
-		if (!base_offset)
-			die("failed to find delta-pack base object %s",
-				sha1_to_hex(base_info));
+		hashcpy (base_sha1, base_info);
 		offset += 20;
 	} else
 		die("I am totally screwed");
@@ -1081,18 +1080,25 @@
 			     char *type,
 			     unsigned long *sizep)
 {
+        unsigned char base_sha1[20];
 	unsigned long base_offset;
 
 	offset = get_delta_base(p, w_curs, offset, kind,
-		obj_offset, &base_offset);
+		obj_offset, &base_offset, base_sha1);
 
 	/* We choose to only get the type of the base object and
 	 * ignore potentially corrupt pack file that expects the delta
 	 * based on a base with a wrong size.  This saves tons of
 	 * inflate() calls.
 	 */
-	if (packed_object_info(p, base_offset, type, NULL))
+	if (base_offset) {
+	    if (packed_object_info(p, base_offset, type, NULL))
 		die("cannot get info for delta-pack base");
+	} else {
+	    if (sha1_object_info(base_sha1, type, NULL))
+		die("cannot get info for delta-pack base %s",
+		    sha1_to_hex (base_sha1));
+	}
 
 	if (sizep) {
 		const unsigned char *data;
@@ -1168,6 +1174,7 @@
 	struct pack_window *w_curs = NULL;
 	unsigned long obj_offset, val;
 	unsigned char *next_sha1;
+	unsigned char sha1[20];
 	enum object_type kind;
 
 	*delta_chain_length = 0;
@@ -1189,7 +1196,7 @@
 			return;
 		case OBJ_OFS_DELTA:
 			get_delta_base(p, &w_curs, offset, kind,
-				obj_offset, &offset);
+				obj_offset, &offset, sha1);
 			if (*delta_chain_length == 0) {
 				/* TODO: find base_sha1 as pointed by offset */
 			}
@@ -1199,6 +1206,8 @@
 			if (*delta_chain_length == 0)
 				hashcpy(base_sha1, next_sha1);
 			offset = find_pack_entry_one(next_sha1, p);
+			if (!offset)
+			    return;
 			break;
 		}
 		obj_offset = offset;
@@ -1281,11 +1290,15 @@
 				unsigned long *sizep)
 {
 	void *delta_data, *result, *base;
+	unsigned char base_sha1[20];
 	unsigned long result_size, base_size, base_offset;
 
 	offset = get_delta_base(p, w_curs, offset, kind,
-		obj_offset, &base_offset);
-	base = unpack_entry(p, base_offset, type, &base_size);
+		obj_offset, &base_offset, base_sha1);
+	if (base_offset)
+	    base = unpack_entry(p, base_offset, type, &base_size);
+	else
+	    base = read_sha1_file (base_sha1, type, &base_size);
 	if (!base)
 		die("failed to read delta base object at %lu from %s",
 		    base_offset, p->pack_name);


* Re: RFC: [PATCH] Support incremental pack files
  2007-02-23  7:13 RFC: [PATCH] Support incremental pack files Martin Koegler
@ 2007-02-23  8:10 ` Junio C Hamano
  2007-02-26 21:45   ` Martin Koegler
  2007-02-23 16:04 ` Nicolas Pitre
  1 sibling, 1 reply; 6+ messages in thread
From: Junio C Hamano @ 2007-02-23  8:10 UTC (permalink / raw)
  To: Martin Koegler; +Cc: git

mkoegler@auto.tuwien.ac.at (Martin Koegler) writes:

> Committing a new version in GIT increases the storage by the compressed
> size of each changed blob. Packing all unpacked objects decreases the
> required storage, but does not generate deltas against objects that are
> already in packs. You need to repack all objects to get around this.
>
> For normal source code, this is not a problem.  But if you want to use
> git for big files, you waste storage (or CPU time for repacking
> everything).

Three points that might help you without any code change.

 - Have you run "git repack -a -d" without "-f"?  Reuse of
   existing deltas is specifically designed to avoid the "CPU
   time for repacking everything" problem.

 - If you are dealing with something other than "normal source
   code", do you know if your objects delta against each other
   well?  If not, turning core.legacyheaders off might be a
   win.  It allows the objects that are recorded as non-delta in
   the resulting pack to be copied straight from loose objects.

 - Once you have accumulated large enough packs with existing
   objects, marking them with .keep would leave them untouched
   during subsequent repacks.  When "git repack -a -d" repacks
   "everything", its definition of "everything" becomes "except
   things that are in packs marked with .keep files".

Side note: Is the .keep mechanism sufficiently documented?  I am
too lazy to check that right now, but here is a tip.  After
releasing a big one, like v1.5.0, I do:

  $ P=.git/objects/pack
  $ git rev-list --objects v1.5.0 |
    git pack-objects --delta-base-offset \
          --depth=30 --window=100 --no-reuse-delta pack
  ...
  6fba5cb8ed92dfef71ff47def9f95fa1e703ba59
  $ mv pack-6fba5cb8ed92dfef71ff47def9f95fa1e703ba59.* $P/
  $ echo 'Post 1.5.0' >$P/pack-6fba5cb8ed92dfef71ff47def9f95fa1e703ba59.keep
  $ git gc --prune

This does three things:

 - It packs everything reachable from v1.5.0 with a delta chain
   that is deeper than the default.

 - The pack is installed in the object store; the presence of the
   .keep file (its contents do not matter) tells subsequent
   repacks not to touch it.

 - Then the remaining objects are packed into a different pack.

With this, the repository uses two packs: one is what I'll keep
until it's time to do the big repack again, and the other is what's
constantly recreated by repacking but contains only "recent"
objects.
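
Day-to-day maintenance then stays cheap; a small usage sketch of the
setup above (not from the original mail):

  $ git repack -a -d   # rewrites only the packs without a .keep file;
                       # the big pack installed above is left untouched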

> It only permits the base object of a delta to be located in a
> different pack or as an unpacked object.

This "only" change needs to be done _very_ carefully, since
self-containedness of pack files is one of the important
elements of the stability of a git repository.

In effect, you are making the delta and its base object into a
new type of "reachability" for the purpose of fsck/prune by
allowing an incremental pack to contain a delta against a loose
object.  I am not saying it is a bad idea, but making sure you
have covered every case where you could lose necessary objects
will be a lot of work.

For example, suppose a delta in your incremental pack is based
on a loose object.  That loose object can become unreachable
after rewinding or rebasing your refs.  You have to somehow
arrange for git-prune to know about this situation and prevent
the object from getting pruned -- otherwise your incremental
pack becomes corrupt.

And that is just one example I could come up with after seeing
your message in 3 minutes while watching TV ;-).  I would
usually say "I am sure there will be more...", but in this
particular case, I am inclined to say that I do not even want to
start thinking about possible fallout from this.  It's scary.


* Re: RFC: [PATCH] Support incremental pack files
  2007-02-23  7:13 RFC: [PATCH] Support incremental pack files Martin Koegler
  2007-02-23  8:10 ` Junio C Hamano
@ 2007-02-23 16:04 ` Nicolas Pitre
  2007-02-23 16:32   ` Shawn O. Pearce
  1 sibling, 1 reply; 6+ messages in thread
From: Nicolas Pitre @ 2007-02-23 16:04 UTC (permalink / raw)
  To: Martin Koegler; +Cc: git

On Fri, 23 Feb 2007, Martin Koegler wrote:

> With CVS (or RCS), committing a version increases the required storage
> only by the size of the diff.
> 
> Committing a new version in GIT increases the storage by the compressed
> size of each changed blob. Packing all unpacked objects decreases the
> required storage, but does not generate deltas against objects that are
> already in packs. You need to repack all objects to get around this.
> 
> For normal source code, this is not a problem.  But if you want to use
> git for big files, you waste storage (or CPU time for repacking
> everything).

Did you try repack -a -d (without -f)?

When -f is not used, already-deltified objects are simply copied as-is
into the new pack without further processing.
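
Concretely, that is the difference between these two invocations (a
minimal sketch, not part of the original mail; the relative cost of
course depends on the repository):

  $ git repack -a -d      # existing deltas are reused and copied verbatim
  $ git repack -a -d -f   # all deltas are recomputed from scratch (expensive)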

> The following patch (against git-1.5.0-rc3) is a prototype for supporting
> incremental pack files in git. The file structures are not changed.
> It only permits the base object of a delta to be located in a
> different pack or as an unpacked object.

We always refused to have packs in the repository that are not self
contained because that would pave the way for all sorts of nasty issues.
It is otherwise much harder to prevent circular delta chains, harder to
ensure full reachability when pruning disconnected objects at the
hierarchical level, etc.  And those are real issues that would bite you
as soon as you perform a single fetch or push with something other than
the native protocol.

In other words, I think this is a bad idea for repository storage.  We do
it as part of the native GIT protocol because it is obvious that there is
no possibility of delta loops (omitted base objects in the transmitted
pack are known to exist in the peer repository), and those packs are
fixed up with the missing objects on the receiving side when not exploded
into loose objects.

Again a repack without -f should not be that expensive.  If it is then 
something is wrong and that should be fixed.

One thing that is too expensive in GIT is rev-list --objects --all (or 
equivalent) used to list objects to pack.  But Shawn and I have a plan 
to fix that at some point... (if only I can find some spare time to 
write more code for it).


Nicolas


* Re: RFC: [PATCH] Support incremental pack files
  2007-02-23 16:04 ` Nicolas Pitre
@ 2007-02-23 16:32   ` Shawn O. Pearce
  0 siblings, 0 replies; 6+ messages in thread
From: Shawn O. Pearce @ 2007-02-23 16:32 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Martin Koegler, git

Nicolas Pitre <nico@cam.org> wrote:
> One thing that is too expensive in GIT is rev-list --objects --all (or 
> equivalent) used to list objects to pack.  But Shawn and I have a plan 
> to fix that at some point... (if only I can find some spare time to 
> write more code for it).

Ditto.  ;-)

I have been pretty swamped this week, but plan on working on packv4
prototype code today and this weekend.  (BTW, I did get the string
code, it's useful, thanks!)

-- 
Shawn.


* Re: RFC: [PATCH] Support incremental pack files
  2007-02-23  8:10 ` Junio C Hamano
@ 2007-02-26 21:45   ` Martin Koegler
  2007-02-26 22:03     ` Johannes Schindelin
  0 siblings, 1 reply; 6+ messages in thread
From: Martin Koegler @ 2007-02-26 21:45 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Fri, Feb 23, 2007 at 12:10:35AM -0800, Junio C Hamano wrote:
> mkoegler@auto.tuwien.ac.at (Martin Koegler) writes:
> 
> > Committing a new version in GIT increases the storage by the compressed
> > size of each changed blob. Packing all unpacked objects decreases the
> > required storage, but does not generate deltas against objects that are
> > already in packs. You need to repack all objects to get around this.
> >
> > For normal source code, this is not a problem.  But if you want to use
> > git for big files, you waste storage (or CPU time for repacking
> > everything).
> 
> Three points that might help you without any code change.
> 
>  - Have you run "git repack -a -d" without "-f"?  Reuse of
>    existing deltas is specifically designed to avoid the "CPU
>    time for repacking everything" problem.
> 
>  - If you are dealing with something other than "normal source
>    code", do you know if your objects delta against each other
>    well?  If not, turning core.legacyheaders off might be a
>    win.  It allows the objects that are recorded as non-delta in
>    the resulting pack to be copied straight from loose objects.

I currently use CVS to save the daily changes in database dumps (files
mostly containing INSERT INTO xx (...) VALUES (...);). I'm trying to
switch this to git.

A commit typically consists of some files with a size > 100 MB, which
are growing every day. All unpacked blob objects of a commit currently
require about 60 MB. An incremental pack file containing one commit
is smaller than 1 MB, so the deltas work well.

>  - Once you have accumulated large enough packs with existing
>    objects, marking them with .keep would leave them untouched
>    during subsequent repacks.  When "git repack -a -d" repacks
>    "everything", its definition of "everything" becomes "except
>    things that are in packs marked with .keep files".
> 
> Side note: Is the .keep mechanism sufficiently documented?  I am
> too lazy to check that right now, but here is a tip.  After
> releasing a big one, like v1.5.0, I do:

I have not found any mention of this in the git documentation.

>   $ P=.git/objects/pack
>   $ git rev-list --objects v1.5.0 |
>     git pack-objects --delta-base-offset \
>           --depth=30 --window=100 --no-reuse-delta pack
>   ...
>   6fba5cb8ed92dfef71ff47def9f95fa1e703ba59
>   $ mv pack-6fba5cb8ed92dfef71ff47def9f95fa1e703ba59.* $P/
>   $ echo 'Post 1.5.0' >$P/pack-6fba5cb8ed92dfef71ff47def9f95fa1e703ba59.keep
>   $ git gc --prune
> 
> This does three things:
> 
>  - It packs everything reachable from v1.5.0 with a delta chain
>    that is deeper than the default.
> 
>  - The pack is installed in the object store; the presence of the
>    .keep file (its contents do not matter) tells subsequent
>    repacks not to touch it.
> 
>  - Then the remaining objects are packed into a different pack.
> 
> With this, the repository uses two packs: one is what I'll keep
> until it's time to do the big repack again, and the other is what's
> constantly recreated by repacking but contains only "recent"
> objects.

This could be a practical solution for me. The biggest disadvantage
of this solution is that each pack file is at least 60 MB.

A nice feature of git is that it normally does not change files,
which keeps incremental backups small. I want to retain this, so I
want to avoid unnecessary repacking.

As I have no tags, I can base the repacking decision only on file
size:

  * Daily: mark all packs >= e.g. 100 MB as keep and repack the
           repository.
  * Weekly/Monthly/Yearly: repack the repository including packs of
           the next size class.

My first idea was to write a script which deletes all keep files,
recreates them for packs bigger than a specified size, and then starts
git-repack; a rough sketch follows below.
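
A rough, untested sketch of such a wrapper (not in the original mail;
the 100 MB default threshold, the .keep message, and the hard-coded
pack directory are just placeholders):

#!/bin/sh
# Re-mark big packs with .keep, then repack everything else.
threshold_kb=${1:-102400}              # keep packs >= 100 MB by default
packdir=.git/objects/pack

rm -f "$packdir"/*.keep
for p in "$packdir"/*.pack
do
	[ -e "$p" ] || continue
	kb=`du -k "$p" | cut -f1`
	if [ "$kb" -ge "$threshold_kb" ]
	then
		echo "kept by size-based repack" > "${p%.pack}.keep"
	fi
done
git repack -a -d -q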

As git-repack already calls find, the size check could also be added
easily to the git-repack script itself:

--- git-repack  2007-02-17 18:06:09.000000000 +0100
+++ git-repack1 2007-02-26 22:09:12.000000000 +0100
@@ -8,11 +8,12 @@
 . git-sh-setup

 no_update_info= all_into_one= remove_redundant=
-local= quiet= no_reuse_delta= extra=
+local= quiet= no_reuse_delta= extra= sizearg=
 while case "$#" in 0) break ;; esac
 do
        case "$1" in
        -n)     no_update_info=t ;;
+       -s)     sizearg="-size -${2}k" ; shift; ;;
        -a)     all_into_one=t ;;
        -d)     remove_redundant=t ;;
        -q)     quiet=-q ;;
@@ -46,7 +47,7 @@
        ;;
 ,t,)
        if [ -d "$PACKDIR" ]; then
-               for e in `cd "$PACKDIR" && find . -type f -name '*.pack' \
+               for e in `cd "$PACKDIR" && find . -type f $sizearg -name '*.pack' \
                        | sed -e 's/^\.\///' -e 's/\.pack$//'`
                do
                        if [ -e "$PACKDIR/$e.keep" ]; then


> > It only permits the base object of a delta to be located in a
> > different pack or as an unpacked object.
> 
> This "only" change needs to be done _very_ carefully, since
> self-containedness of pack files is one of the important
> elements of the stability of a git repository.

I understand the problems. GIT would need at least a list of external
base objects in each pack to speed up things like e.g. git-prune.

Regards, Martin Kögler


* Re: RFC: [PATCH] Support incremental pack files
  2007-02-26 21:45   ` Martin Koegler
@ 2007-02-26 22:03     ` Johannes Schindelin
  0 siblings, 0 replies; 6+ messages in thread
From: Johannes Schindelin @ 2007-02-26 22:03 UTC (permalink / raw)
  To: Martin Koegler; +Cc: Junio C Hamano, git

Hi,

On Mon, 26 Feb 2007, Martin Koegler wrote:

> On Fri, Feb 23, 2007 at 12:10:35AM -0800, Junio C Hamano wrote:
>
> >   $ P=.git/objects/pack
> >   $ git rev-list --objects v1.5.0 |
> >     git pack-objects --delta-base-offset \
> >           --depth=30 --window=100 --no-reuse-delta pack
> >   ...
> >   6fba5cb8ed92dfef71ff47def9f95fa1e703ba59
> >   $ mv pack-6fba5cb8ed92dfef71ff47def9f95fa1e703ba59.* $P/
> >   $ echo 'Post 1.5.0' >$P/pack-6fba5cb8ed92dfef71ff47def9f95fa1e703ba59.keep
> >   $ git gc --prune
> > 
> > This does three things:
> > 
> >  - It packs everything reachable from v1.5.0 with a delta chain
> >    that is deeper than the default.
> > 
> >  - The pack is installed in the object store; the presence of the
> >    .keep file (its contents do not matter) tells subsequent
> >    repacks not to touch it.
> > 
> >  - Then the remaining objects are packed into a different pack.
> > 
> > With this, the repository uses two packs: one is what I'll keep
> > until it's time to do the big repack again, and the other is what's
> > constantly recreated by repacking but contains only "recent"
> > objects.
> 
> This could be a practical solution for me. The biggest disadvantage
> of this solution is that each pack file is at least 60 MB.

Junio has a branch he rewinds sometimes. That's why he does not do the 
obvious, which should work for you:

$ git gc --prune
$ for p in .git/objects/pack/*.pack; do
	keepfile=`echo $p | sed s/pack$/keep/`
	echo "Keep all current packs as-are" > $keepfile
  done

You should run this from time to time. You can run "git gc --prune" more 
often, of course...

Hth,
Dscho

