All of lore.kernel.org
 help / color / mirror / Atom feed
* support for large packs and 64-bit offsets
@ 2007-04-09  5:06 Nicolas Pitre
  2007-04-09  5:06 ` [PATCH 01/10] get rid of num_packed_objects() Nicolas Pitre
  2007-04-09 17:19 ` support for large packs and 64-bit offsets Shawn O. Pearce
  0 siblings, 2 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

This patch series adds support for large packs to GIT.  It creates a new
index format to accommodate both larger offsets and to store a checksum
for each raw object in a pack.  More details is provided in each patch
commit message.

This is not a replacement for the pack size limiting feature.  I think that
GIT should support both: large packs and split packs.

Nicolas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 01/10] get rid of num_packed_objects()
  2007-04-09  5:06 support for large packs and 64-bit offsets Nicolas Pitre
@ 2007-04-09  5:06 ` Nicolas Pitre
  2007-04-09  5:06   ` [PATCH 02/10] make overflow test on delta base offset work regardless of variable size Nicolas Pitre
  2007-04-09 17:19 ` support for large packs and 64-bit offsets Shawn O. Pearce
  1 sibling, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

The coming index format change doesn't allow for the number of objects
to be determined from the size of the index file directly.  Instead, Let's
initialize a field in the packed_git structure with the object count when
the index is validated since the count is always known at that point.

While at it let's reorder some struct packed_git fields to avoid padding
due to needed 64-bit alignment for some of them.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 builtin-count-objects.c |    2 +-
 builtin-fsck.c          |    2 +-
 builtin-pack-objects.c  |    4 ++--
 cache.h                 |    8 ++++----
 pack-check.c            |    4 ++--
 sha1_file.c             |   17 ++++++-----------
 sha1_name.c             |    2 +-
 7 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/builtin-count-objects.c b/builtin-count-objects.c
index 6263d8a..ff90ebd 100644
--- a/builtin-count-objects.c
+++ b/builtin-count-objects.c
@@ -111,7 +111,7 @@ int cmd_count_objects(int ac, const char **av, const char *prefix)
 		for (p = packed_git; p; p = p->next) {
 			if (!p->pack_local)
 				continue;
-			packed += num_packed_objects(p);
+			packed += p->num_objects;
 			num_pack++;
 		}
 		printf("count: %lu\n", loose);
diff --git a/builtin-fsck.c b/builtin-fsck.c
index 4d8b66c..44a02d3 100644
--- a/builtin-fsck.c
+++ b/builtin-fsck.c
@@ -653,7 +653,7 @@ int cmd_fsck(int argc, char **argv, const char *prefix)
 			verify_pack(p, 0);
 
 		for (p = packed_git; p; p = p->next) {
-			uint32_t i, num = num_packed_objects(p);
+			uint32_t i, num = p->num_objects;
 			for (i = 0; i < num; i++)
 				fsck_sha1(nth_packed_object_sha1(p, i));
 		}
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 45ac3e4..6bff17b 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -164,7 +164,7 @@ static int cmp_offset(const void *a_, const void *b_)
 static void prepare_pack_revindex(struct pack_revindex *rix)
 {
 	struct packed_git *p = rix->p;
-	int num_ent = num_packed_objects(p);
+	int num_ent = p->num_objects;
 	int i;
 	const char *index = p->index_data;
 
@@ -198,7 +198,7 @@ static struct revindex_entry * find_packed_object(struct packed_git *p,
 		prepare_pack_revindex(rix);
 	revindex = rix->revindex;
 	lo = 0;
-	hi = num_packed_objects(p) + 1;
+	hi = p->num_objects + 1;
 	do {
 		int mi = (lo + hi) / 2;
 		if (revindex[mi].offset == ofs) {
diff --git a/cache.h b/cache.h
index c15daa8..73d8e63 100644
--- a/cache.h
+++ b/cache.h
@@ -378,11 +378,12 @@ struct pack_window {
 extern struct packed_git {
 	struct packed_git *next;
 	struct pack_window *windows;
-	const void *index_data;
-	off_t index_size;
 	off_t pack_size;
-	time_t mtime;
+	const void *index_data;
+	size_t index_size;
+	uint32_t num_objects;
 	int index_version;
+	time_t mtime;
 	int pack_fd;
 	int pack_local;
 	unsigned char sha1[20];
@@ -433,7 +434,6 @@ extern void pack_report(void);
 extern unsigned char* use_pack(struct packed_git *, struct pack_window **, off_t, unsigned int *);
 extern void unuse_pack(struct pack_window **);
 extern struct packed_git *add_packed_git(const char *, int, int);
-extern uint32_t num_packed_objects(const struct packed_git *p);
 extern const unsigned char *nth_packed_object_sha1(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
 extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
diff --git a/pack-check.c b/pack-check.c
index f58083d..d04536b 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -40,7 +40,7 @@ static int verify_packfile(struct packed_git *p,
 	 * have verified that nr_objects matches between idx and pack,
 	 * we do not do scan-streaming check on the pack file.
 	 */
-	nr_objects = num_packed_objects(p);
+	nr_objects = p->num_objects;
 	for (i = 0, err = 0; i < nr_objects; i++) {
 		const unsigned char *sha1;
 		void *data;
@@ -79,7 +79,7 @@ static void show_pack_info(struct packed_git *p)
 {
 	uint32_t nr_objects, i, chain_histogram[MAX_CHAIN];
 
-	nr_objects = num_packed_objects(p);
+	nr_objects = p->num_objects;
 	memset(chain_histogram, 0, sizeof(chain_histogram));
 
 	for (i = 0; i < nr_objects; i++) {
diff --git a/sha1_file.c b/sha1_file.c
index 4304fe9..d9ca69a 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -494,6 +494,7 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 	p->index_version = 1;
 	p->index_data = idx_map;
 	p->index_size = idx_size;
+	p->num_objects = nr;
 	return 0;
 }
 
@@ -605,11 +606,11 @@ static int open_packed_git_1(struct packed_git *p)
 			p->pack_name, ntohl(hdr.hdr_version));
 
 	/* Verify the pack matches its index. */
-	if (num_packed_objects(p) != ntohl(hdr.hdr_entries))
+	if (p->num_objects != ntohl(hdr.hdr_entries))
 		return error("packfile %s claims to have %u objects"
-			" while index size indicates %u objects",
-			p->pack_name, ntohl(hdr.hdr_entries),
-			num_packed_objects(p));
+			     " while index indicates %u objects",
+			     p->pack_name, ntohl(hdr.hdr_entries),
+			     p->num_objects);
 	if (lseek(p->pack_fd, p->pack_size - sizeof(sha1), SEEK_SET) == -1)
 		return error("end of packfile %s is unavailable", p->pack_name);
 	if (read_in_full(p->pack_fd, sha1, sizeof(sha1)) != sizeof(sha1))
@@ -1526,18 +1527,12 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 	return data;
 }
 
-uint32_t num_packed_objects(const struct packed_git *p)
-{
-	/* See check_packed_git_idx() */
-	return (uint32_t)((p->index_size - 20 - 20 - 4*256) / 24);
-}
-
 const unsigned char *nth_packed_object_sha1(const struct packed_git *p,
 					    uint32_t n)
 {
 	const unsigned char *index = p->index_data;
 	index += 4 * 256;
-	if (num_packed_objects(p) <= n)
+	if (n >= p->num_objects)
 		return NULL;
 	return index + 24 * n + 4;
 }
diff --git a/sha1_name.c b/sha1_name.c
index 267ea3f..b0b12bb 100644
--- a/sha1_name.c
+++ b/sha1_name.c
@@ -76,7 +76,7 @@ static int find_short_packed_object(int len, const unsigned char *match, unsigne
 
 	prepare_packed_git();
 	for (p = packed_git; p && found < 2; p = p->next) {
-		uint32_t num = num_packed_objects(p);
+		uint32_t num = p->num_objects;
 		uint32_t first = 0, last = num;
 		while (first < last) {
 			uint32_t mid = (first + last) / 2;
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 02/10] make overflow test on delta base offset work regardless of variable size
  2007-04-09  5:06 ` [PATCH 01/10] get rid of num_packed_objects() Nicolas Pitre
@ 2007-04-09  5:06   ` Nicolas Pitre
  2007-04-09  5:06     ` [PATCH 03/10] add overflow tests on pack offset variables Nicolas Pitre
  0 siblings, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

This patch introduces the MSB() macro to obtain the desired number of
most significant bits from a given variable independently of the variable
type.

It is then used to better implement the overflow test on the OBJ_OFS_DELTA
base offset variable with the property of always working correctly
regardless of the type/size of that variable.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 builtin-pack-objects.c   |    2 +-
 builtin-unpack-objects.c |    2 +-
 git-compat-util.h        |    8 ++++++++
 index-pack.c             |    2 +-
 sha1_file.c              |    2 +-
 5 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 6bff17b..ee607a0 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -1014,7 +1014,7 @@ static void check_object(struct object_entry *entry)
 				ofs = c & 127;
 				while (c & 128) {
 					ofs += 1;
-					if (!ofs || ofs & ~(~0UL >> 7))
+					if (!ofs || MSB(ofs, 7))
 						die("delta base offset overflow in pack for %s",
 						    sha1_to_hex(entry->sha1));
 					c = buf[used_0++];
diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c
index 3956c56..63f7db6 100644
--- a/builtin-unpack-objects.c
+++ b/builtin-unpack-objects.c
@@ -209,7 +209,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		base_offset = c & 127;
 		while (c & 128) {
 			base_offset += 1;
-			if (!base_offset || base_offset & ~(~0UL >> 7))
+			if (!base_offset || MSB(base_offset, 7))
 				die("offset value overflow for delta base object");
 			pack = fill(1);
 			c = *pack;
diff --git a/git-compat-util.h b/git-compat-util.h
index 139fc19..bcfcb35 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -13,6 +13,14 @@
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
 
+#ifdef __GNUC__
+#define TYPEOF(x) (__typeof__(x))
+#else
+#define TYPEOF(x)
+#endif
+
+#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
+
 #if !defined(__APPLE__) && !defined(__FreeBSD__)
 #define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
 #define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
diff --git a/index-pack.c b/index-pack.c
index 3c768fb..0e54aa6 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -249,7 +249,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 		base_offset = c & 127;
 		while (c & 128) {
 			base_offset += 1;
-			if (!base_offset || base_offset & ~(~0UL >> 7))
+			if (!base_offset || MSB(base_offset, 7))
 				bad_object(obj->offset, "offset value overflow for delta base object");
 			p = fill(1);
 			c = *p;
diff --git a/sha1_file.c b/sha1_file.c
index d9ca69a..ebdd497 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1150,7 +1150,7 @@ static off_t get_delta_base(struct packed_git *p,
 		base_offset = c & 127;
 		while (c & 128) {
 			base_offset += 1;
-			if (!base_offset || base_offset & ~(~0UL >> 7))
+			if (!base_offset || MSB(base_offset, 7))
 				die("offset value overflow for delta base object");
 			c = base_info[used++];
 			base_offset = (base_offset << 7) + (c & 127);
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 03/10] add overflow tests on pack offset variables
  2007-04-09  5:06   ` [PATCH 02/10] make overflow test on delta base offset work regardless of variable size Nicolas Pitre
@ 2007-04-09  5:06     ` Nicolas Pitre
  2007-04-09  5:06       ` [PATCH 04/10] compute a CRC32 for each object as stored in a pack Nicolas Pitre
  0 siblings, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

Change a few size and offset variables to more appropriate type, then
add overflow tests on those offsets.  This prevents any bad data to be
generated/processed if off_t happens to not be large enough to handle
some big packs.

Better be safe than sorry.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 builtin-pack-objects.c   |   19 +++++++++++++------
 builtin-unpack-objects.c |   17 +++++++++++------
 index-pack.c             |   14 ++++++++++----
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index ee607a0..d0be879 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -369,7 +369,7 @@ static int revalidate_loose_object(struct object_entry *entry,
 	return check_loose_inflate(map, mapsize, size);
 }
 
-static off_t write_object(struct sha1file *f,
+static unsigned long write_object(struct sha1file *f,
 				  struct object_entry *entry)
 {
 	unsigned long size;
@@ -503,16 +503,23 @@ static off_t write_one(struct sha1file *f,
 			       struct object_entry *e,
 			       off_t offset)
 {
+	unsigned long size;
+
+	/* offset is non zero if object is written already. */
 	if (e->offset || e->preferred_base)
-		/* offset starts from header size and cannot be zero
-		 * if it is written already.
-		 */
 		return offset;
-	/* if we are deltified, write out its base object first. */
+
+	/* if we are deltified, write out base object first. */
 	if (e->delta)
 		offset = write_one(f, e->delta, offset);
+
 	e->offset = offset;
-	return offset + write_object(f, e);
+	size = write_object(f, e);
+
+	/* make sure off_t is sufficiently large not to wrap */
+	if (offset > offset + size)
+		die("pack too large for current definition of off_t");
+	return offset + size;
 }
 
 static void write_pack_file(void)
diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c
index 63f7db6..f821906 100644
--- a/builtin-unpack-objects.c
+++ b/builtin-unpack-objects.c
@@ -13,7 +13,8 @@ static const char unpack_usage[] = "git-unpack-objects [-n] [-q] [-r] < pack-fil
 
 /* We always read in 4kB chunks. */
 static unsigned char buffer[4096];
-static unsigned long offset, len, consumed_bytes;
+static unsigned int offset, len;
+static off_t consumed_bytes;
 static SHA_CTX ctx;
 
 /*
@@ -49,6 +50,10 @@ static void use(int bytes)
 		die("used more bytes than were available");
 	len -= bytes;
 	offset += bytes;
+
+	/* make sure off_t is sufficiently large not to wrap */
+	if (consumed_bytes > consumed_bytes + bytes)
+		die("pack too large for current definition of off_t");
 	consumed_bytes += bytes;
 }
 
@@ -88,17 +93,17 @@ static void *get_data(unsigned long size)
 
 struct delta_info {
 	unsigned char base_sha1[20];
-	unsigned long base_offset;
+	unsigned nr;
+	off_t base_offset;
 	unsigned long size;
 	void *delta;
-	unsigned nr;
 	struct delta_info *next;
 };
 
 static struct delta_info *delta_list;
 
 static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1,
-			      unsigned long base_offset,
+			      off_t base_offset,
 			      void *delta, unsigned long size)
 {
 	struct delta_info *info = xmalloc(sizeof(*info));
@@ -113,7 +118,7 @@ static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1,
 }
 
 struct obj_info {
-	unsigned long offset;
+	off_t offset;
 	unsigned char sha1[20];
 };
 
@@ -200,7 +205,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	} else {
 		unsigned base_found = 0;
 		unsigned char *pack, c;
-		unsigned long base_offset;
+		off_t base_offset;
 		unsigned lo, mid, hi;
 
 		pack = fill(1);
diff --git a/index-pack.c b/index-pack.c
index 0e54aa6..66fb0bc 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -12,7 +12,7 @@ static const char index_pack_usage[] =
 
 struct object_entry
 {
-	unsigned long offset;
+	off_t offset;
 	unsigned long size;
 	unsigned int hdr_size;
 	enum object_type type;
@@ -22,7 +22,7 @@ struct object_entry
 
 union delta_base {
 	unsigned char sha1[20];
-	unsigned long offset;
+	off_t offset;
 };
 
 /*
@@ -83,7 +83,8 @@ static unsigned display_progress(unsigned n, unsigned total, unsigned last_pc)
 
 /* We always read in 4kB chunks. */
 static unsigned char input_buffer[4096];
-static unsigned long input_offset, input_len, consumed_bytes;
+static unsigned int input_offset, input_len;
+static off_t consumed_bytes;
 static SHA_CTX input_ctx;
 static int input_fd, output_fd, pack_fd;
 
@@ -129,6 +130,10 @@ static void use(int bytes)
 		die("used more bytes than were available");
 	input_len -= bytes;
 	input_offset += bytes;
+
+	/* make sure off_t is sufficiently large not to wrap */
+	if (consumed_bytes > consumed_bytes + bytes)
+		die("pack too large for current definition of off_t");
 	consumed_bytes += bytes;
 }
 
@@ -216,7 +221,8 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size)
 static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
 {
 	unsigned char *p, c;
-	unsigned long size, base_offset;
+	unsigned long size;
+	off_t base_offset;
 	unsigned shift;
 
 	obj->offset = consumed_bytes;
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 04/10] compute a CRC32 for each object as stored in a pack
  2007-04-09  5:06     ` [PATCH 03/10] add overflow tests on pack offset variables Nicolas Pitre
@ 2007-04-09  5:06       ` Nicolas Pitre
  2007-04-09  5:06         ` [PATCH 05/10] compute object CRC32 with index-pack Nicolas Pitre
  0 siblings, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.

The problem with  this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object.  This is a
real issue that already happened at least once in the past.

In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib.  But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.

Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.

So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index.  This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.

Why CRC32 instead of a faster checksum like Adler32?  Quoting Wikipedia:

   Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
   short messages. He wrote "Briefly, the problem is that, for very short
   packets, Adler32 is guaranteed to give poor coverage of the available
   bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
   that sum A does not wrap for short messages. The maximum value of A for
   a 128-byte message is 32640, which is below the value 65521 used by the
   modulo operation. An extended explanation can be found in RFC 3309,
   which mandates the use of CRC32 instead of Adler-32 for SCTP, the
   Stream Control Transmission Protocol.

In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is dimed not to be sufficient.  Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.

OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects.  It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 builtin-pack-objects.c |    6 ++++++
 csum-file.c            |   14 ++++++++++++++
 csum-file.h            |    4 ++++
 3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index d0be879..03e36f0 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -44,6 +44,7 @@ struct object_entry {
 				 * be used as the base objectto delta huge
 				 * objects against.
 				 */
+	uint32_t crc32;		/* crc of raw pack data for this object */
 };
 
 /*
@@ -381,6 +382,9 @@ static unsigned long write_object(struct sha1file *f,
 	enum object_type obj_type;
 	int to_reuse = 0;
 
+	if (!pack_to_stdout)
+		crc32_begin(f);
+
 	obj_type = entry->type;
 	if (! entry->in_pack)
 		to_reuse = 0;	/* can't reuse what we don't have */
@@ -496,6 +500,8 @@ static unsigned long write_object(struct sha1file *f,
 	if (entry->delta)
 		written_delta++;
 	written++;
+	if (!pack_to_stdout)
+		entry->crc32 = crc32_end(f);
 	return hdrlen + datalen;
 }
 
diff --git a/csum-file.c b/csum-file.c
index b7174c6..7c806ad 100644
--- a/csum-file.c
+++ b/csum-file.c
@@ -49,6 +49,8 @@ int sha1close(struct sha1file *f, unsigned char *result, int update)
 
 int sha1write(struct sha1file *f, void *buf, unsigned int count)
 {
+	if (f->do_crc)
+		f->crc32 = crc32(f->crc32, buf, count);
 	while (count) {
 		unsigned offset = f->offset;
 		unsigned left = sizeof(f->buffer) - offset;
@@ -91,6 +93,7 @@ struct sha1file *sha1create(const char *fmt, ...)
 	f->fd = fd;
 	f->error = 0;
 	f->offset = 0;
+	f->do_crc = 0;
 	SHA1_Init(&f->ctx);
 	return f;
 }
@@ -111,6 +114,7 @@ struct sha1file *sha1fd(int fd, const char *name)
 	f->fd = fd;
 	f->error = 0;
 	f->offset = 0;
+	f->do_crc = 0;
 	SHA1_Init(&f->ctx);
 	return f;
 }
@@ -143,4 +147,14 @@ int sha1write_compressed(struct sha1file *f, void *in, unsigned int size)
 	return size;
 }
 
+void crc32_begin(struct sha1file *f)
+{
+	f->crc32 = crc32(0, Z_NULL, 0);
+	f->do_crc = 1;
+}
 
+uint32_t crc32_end(struct sha1file *f)
+{
+	f->do_crc = 0;
+	return f->crc32;
+}
diff --git a/csum-file.h b/csum-file.h
index 3ad1a99..7e13391 100644
--- a/csum-file.h
+++ b/csum-file.h
@@ -7,6 +7,8 @@ struct sha1file {
 	unsigned int offset, namelen;
 	SHA_CTX ctx;
 	char name[PATH_MAX];
+	int do_crc;
+	uint32_t crc32;
 	unsigned char buffer[8192];
 };
 
@@ -15,5 +17,7 @@ extern struct sha1file *sha1create(const char *fmt, ...) __attribute__((format (
 extern int sha1close(struct sha1file *, unsigned char *, int);
 extern int sha1write(struct sha1file *, void *, unsigned int);
 extern int sha1write_compressed(struct sha1file *, void *, unsigned int);
+extern void crc32_begin(struct sha1file *);
+extern uint32_t crc32_end(struct sha1file *);
 
 #endif
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 05/10] compute object CRC32 with index-pack
  2007-04-09  5:06       ` [PATCH 04/10] compute a CRC32 for each object as stored in a pack Nicolas Pitre
@ 2007-04-09  5:06         ` Nicolas Pitre
  2007-04-09  5:06           ` [PATCH 06/10] pack-objects: learn about pack index version 2 Nicolas Pitre
  0 siblings, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

Same as previous patch but for index-pack.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 index-pack.c |   16 +++++++++++++---
 1 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/index-pack.c b/index-pack.c
index 66fb0bc..d33f723 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -15,6 +15,7 @@ struct object_entry
 	off_t offset;
 	unsigned long size;
 	unsigned int hdr_size;
+	uint32_t crc32;
 	enum object_type type;
 	enum object_type real_type;
 	unsigned char sha1[20];
@@ -86,6 +87,7 @@ static unsigned char input_buffer[4096];
 static unsigned int input_offset, input_len;
 static off_t consumed_bytes;
 static SHA_CTX input_ctx;
+static uint32_t input_crc32;
 static int input_fd, output_fd, pack_fd;
 
 /* Discard current buffer used content. */
@@ -128,6 +130,7 @@ static void use(int bytes)
 {
 	if (bytes > input_len)
 		die("used more bytes than were available");
+	input_crc32 = crc32(input_crc32, input_buffer + input_offset, bytes);
 	input_len -= bytes;
 	input_offset += bytes;
 
@@ -224,8 +227,10 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 	unsigned long size;
 	off_t base_offset;
 	unsigned shift;
+	void *data;
 
 	obj->offset = consumed_bytes;
+	input_crc32 = crc32(0, Z_NULL, 0);
 
 	p = fill(1);
 	c = *p;
@@ -276,7 +281,9 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 	}
 	obj->hdr_size = consumed_bytes - obj->offset;
 
-	return unpack_entry_data(obj->offset, obj->size);
+	data = unpack_entry_data(obj->offset, obj->size);
+	obj->crc32 = input_crc32;
+	return data;
 }
 
 static void *get_data_from_pack(struct object_entry *obj)
@@ -521,7 +528,7 @@ static void parse_pack_objects(unsigned char *sha1)
 		fputc('\n', stderr);
 }
 
-static int write_compressed(int fd, void *in, unsigned int size)
+static int write_compressed(int fd, void *in, unsigned int size, uint32_t *obj_crc)
 {
 	z_stream stream;
 	unsigned long maxsize;
@@ -542,6 +549,7 @@ static int write_compressed(int fd, void *in, unsigned int size)
 
 	size = stream.total_out;
 	write_or_die(fd, out, size);
+	*obj_crc = crc32(*obj_crc, out, size);
 	free(out);
 	return size;
 }
@@ -562,8 +570,10 @@ static void append_obj_to_pack(const unsigned char *sha1, void *buf,
 	}
 	header[n++] = c;
 	write_or_die(output_fd, header, n);
+	obj[0].crc32 = crc32(0, Z_NULL, 0);
+	obj[0].crc32 = crc32(obj[0].crc32, header, n);
 	obj[1].offset = obj[0].offset + n;
-	obj[1].offset += write_compressed(output_fd, buf, size);
+	obj[1].offset += write_compressed(output_fd, buf, size, &obj[0].crc32);
 	hashcpy(obj->sha1, sha1);
 }
 
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 06/10] pack-objects: learn about pack index version 2
  2007-04-09  5:06         ` [PATCH 05/10] compute object CRC32 with index-pack Nicolas Pitre
@ 2007-04-09  5:06           ` Nicolas Pitre
  2007-04-09  5:06             ` [PATCH 07/10] index-pack: " Nicolas Pitre
  2007-04-09  5:32             ` [PATCH 06/10] pack-objects: learn about pack index version 2 Junio C Hamano
  0 siblings, 2 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

Pack index version 2 goes as follows:

 - 8 bytes of header with signature and version.

 - 256 entries of 4-byte first-level fan-out table.

 - Table of sorted 20-byte SHA1 records for each object in pack.

 - Table of 4-byte CRC32 entries for raw pack object data.

 - Table of 4-byte offset entries for objects in the pack if offset is
   representable with 31 bits or less, otherwise it is an index in the next
   table with top bit set.

 - Table of 8-byte offset entries indexed from previous table for offsets
   which are 32 bits or more (optional).

 - 20-byte SHA1 checksum of sorted object names.

 - 20-byte SHA1 checksum of the above.

The object SHA1 table is all contiguous so future pack format that would
contain this table directly won't require big changes to the code. It is
also tighter for slightly better cache locality when looking up entries.

Support for large packs exceeding 31 bits in size won't impose an index
size bloat for packs within that range that don't need a 64-bit offset.
And because newer objects which are likely to be the most frequently used
are located at the beginning of the pack, they won't pay the 64-bit offset
lookup at run time either even if the pack is large.

Right now an index version 2 is created only when the biggest offset in a
pack reaches 31 bits.  It might be a good idea to always use index version
2 eventually to benefit from the CRC32 it contains when reusing pack data
while repacking.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 builtin-pack-objects.c |   85 ++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 78 insertions(+), 7 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 03e36f0..c7e0a42 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -169,13 +169,33 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
 	int i;
 	const char *index = p->index_data;
 
-	index += 4 * 256;
 	rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
-	for (i = 0; i < num_ent; i++) {
-		uint32_t hl = *((uint32_t *)(index + 24 * i));
-		rix->revindex[i].offset = ntohl(hl);
-		rix->revindex[i].nr = i;
+	index += 4 * 256;
+
+	if (p->index_version > 1) {
+		const uint32_t *off_32 =
+			(uint32_t *)(index + 8 + p->num_objects * (20 + 4));
+		const uint32_t *off_64 = off_32 + p->num_objects;
+		for (i = 0; i < num_ent; i++) {
+			uint32_t off = ntohl(*off_32++);
+			if (!(off & 0x80000000)) {
+				rix->revindex[i].offset = off;
+			} else {
+				rix->revindex[i].offset =
+					((uint64_t)ntohl(*off_64++)) << 32;
+				rix->revindex[i].offset |=
+					ntohl(*off_64++);
+			}
+			rix->revindex[i].nr = i;
+		}
+	} else {
+		for (i = 0; i < num_ent; i++) {
+			uint32_t hl = *((uint32_t *)(index + 24 * i));
+			rix->revindex[i].offset = ntohl(hl);
+			rix->revindex[i].nr = i;
+		}
 	}
+
 	/* This knows the pack format -- the 20-byte trailer
 	 * follows immediately after the last object data.
 	 */
@@ -582,6 +602,18 @@ static void write_index_file(void)
 	struct object_entry **list = sorted_by_sha;
 	struct object_entry **last = list + nr_result;
 	uint32_t array[256];
+	uint32_t index_version;
+
+	/* if last object's offset is >= 2^31 we should use index V2 */
+	index_version = (objects[nr_result-1].offset >> 31) ? 2 : 1;
+
+	/* index versions 2 and above need a header */
+	if (index_version >= 2) {
+		struct pack_idx_header hdr;
+		hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
+		hdr.idx_version = htonl(index_version);
+		sha1write(f, &hdr, sizeof(hdr));
+	}
 
 	/*
 	 * Write the first-level table (the list is sorted,
@@ -607,10 +639,49 @@ static void write_index_file(void)
 	list = sorted_by_sha;
 	for (i = 0; i < nr_result; i++) {
 		struct object_entry *entry = *list++;
-		uint32_t offset = htonl(entry->offset);
-		sha1write(f, &offset, 4);
+		if (index_version < 2) {
+			uint32_t offset = htonl(entry->offset);
+			sha1write(f, &offset, 4);
+		}
 		sha1write(f, entry->sha1, 20);
 	}
+
+	if (index_version >= 2) {
+		unsigned int nr_large_offset = 0;
+
+		/* write the crc32 table */
+		list = sorted_by_sha;
+		for (i = 0; i < nr_objects; i++) {
+			struct object_entry *entry = *list++;
+			uint32_t crc32_val = htonl(entry->crc32);
+			sha1write(f, &crc32_val, 4);
+		}
+
+		/* write the 32-bit offset table */
+		list = sorted_by_sha;
+		for (i = 0; i < nr_objects; i++) {
+			struct object_entry *entry = *list++;
+			uint32_t offset = (entry->offset <= 0x7fffffff) ?
+				entry->offset : (0x80000000 | nr_large_offset++);
+			offset = htonl(offset);
+			sha1write(f, &offset, 4);
+		}
+
+		/* write the large offset table */
+		list = sorted_by_sha;
+		while (nr_large_offset) {
+			struct object_entry *entry = *list++;
+			uint64_t offset = entry->offset;
+			if (offset > 0x7fffffff) {
+				uint32_t split[2];
+				split[0]        = htonl(offset >> 32);
+				split[1] = htonl(offset & 0xffffffff);
+				sha1write(f, split, 8);
+				nr_large_offset--;
+			}
+		}
+	}
+
 	sha1write(f, pack_file_sha1, 20);
 	sha1close(f, NULL, 1);
 }
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 07/10] index-pack: learn about pack index version 2
  2007-04-09  5:06           ` [PATCH 06/10] pack-objects: learn about pack index version 2 Nicolas Pitre
@ 2007-04-09  5:06             ` Nicolas Pitre
  2007-04-09  5:06               ` [PATCH 08/10] sha1_file.c: learn about " Nicolas Pitre
  2007-04-09  5:32             ` [PATCH 06/10] pack-objects: learn about pack index version 2 Junio C Hamano
  1 sibling, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

Like previous patch but for index-pack.

[ There is quite some code duplication between pack-objects and index-pack
  for generating a pack index (and fast-import as well I suppose).  This
  should be reworked into a common function eventually. But not now. ]

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 index-pack.c |   66 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/index-pack.c b/index-pack.c
index d33f723..a833f64 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -686,9 +686,10 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 {
 	struct sha1file *f;
 	struct object_entry **sorted_by_sha, **list, **last;
-	unsigned int array[256];
+	uint32_t array[256];
 	int i, fd;
 	SHA_CTX ctx;
+	uint32_t index_version;
 
 	if (nr_objects) {
 		sorted_by_sha =
@@ -699,7 +700,6 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 			sorted_by_sha[i] = &objects[i];
 		qsort(sorted_by_sha, nr_objects, sizeof(sorted_by_sha[0]),
 		      sha1_compare);
-
 	}
 	else
 		sorted_by_sha = list = last = NULL;
@@ -718,6 +718,17 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 		die("unable to create %s: %s", index_name, strerror(errno));
 	f = sha1fd(fd, index_name);
 
+	/* if last object's offset is >= 2^31 we should use index V2 */
+	index_version = (objects[nr_objects-1].offset >> 31) ? 2 : 1;
+
+	/* index versions 2 and above need a header */
+	if (index_version >= 2) {
+		struct pack_idx_header hdr;
+		hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
+		hdr.idx_version = htonl(index_version);
+		sha1write(f, &hdr, sizeof(hdr));
+	}
+
 	/*
 	 * Write the first-level table (the list is sorted,
 	 * but we use a 256-entry lookup to be able to avoid
@@ -734,24 +745,61 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 		array[i] = htonl(next - sorted_by_sha);
 		list = next;
 	}
-	sha1write(f, array, 256 * sizeof(int));
+	sha1write(f, array, 256 * 4);
 
-	/* recompute the SHA1 hash of sorted object names.
-	 * currently pack-objects does not do this, but that
-	 * can be fixed.
-	 */
+	/* compute the SHA1 hash of sorted object names. */
 	SHA1_Init(&ctx);
+
 	/*
 	 * Write the actual SHA1 entries..
 	 */
 	list = sorted_by_sha;
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = *list++;
-		unsigned int offset = htonl(obj->offset);
-		sha1write(f, &offset, 4);
+		if (index_version < 2) {
+			uint32_t offset = htonl(obj->offset);
+			sha1write(f, &offset, 4);
+		}
 		sha1write(f, obj->sha1, 20);
 		SHA1_Update(&ctx, obj->sha1, 20);
 	}
+
+	if (index_version >= 2) {
+		unsigned int nr_large_offset = 0;
+
+		/* write the crc32 table */
+		list = sorted_by_sha;
+		for (i = 0; i < nr_objects; i++) {
+			struct object_entry *obj = *list++;
+			uint32_t crc32_val = htonl(obj->crc32);
+			sha1write(f, &crc32_val, 4);
+		}
+
+		/* write the 32-bit offset table */
+		list = sorted_by_sha;
+		for (i = 0; i < nr_objects; i++) {
+			struct object_entry *obj = *list++;
+			uint32_t offset = (obj->offset <= 0x7fffffff) ?
+				obj->offset : (0x80000000 | nr_large_offset++);
+			offset = htonl(offset);
+			sha1write(f, &offset, 4);
+		}
+
+		/* write the large offset table */
+		list = sorted_by_sha;
+		while (nr_large_offset) {
+			struct object_entry *obj = *list++;
+			uint64_t offset = obj->offset;
+			if (offset > 0x7fffffff) {
+				uint32_t split[2];
+				split[0]	= htonl(offset >> 32);
+				split[1] = htonl(offset & 0xffffffff);
+				sha1write(f, split, 8);
+				nr_large_offset--;
+			}
+		}
+	}
+
 	sha1write(f, sha1, 20);
 	sha1close(f, NULL, 1);
 	free(sorted_by_sha);
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 08/10] sha1_file.c: learn about index version 2
  2007-04-09  5:06             ` [PATCH 07/10] index-pack: " Nicolas Pitre
@ 2007-04-09  5:06               ` Nicolas Pitre
  2007-04-09  5:06                 ` [PATCH 09/10] show-index.c: learn about index v2 Nicolas Pitre
  0 siblings, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

With this patch, packs larger than 4GB are usable, even on a 32-bit machine
(at least on Linux).  If off_t is not large enough to deal with a large
pack then die() is called instead of attempting to use the pack and
producing garbage.

This was tested with a 8GB pack specially created for the occasion on
a 32-bit machine.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 sha1_file.c |  118 ++++++++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 89 insertions(+), 29 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index ebdd497..5f3a15d 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -437,7 +437,7 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 	void *idx_map;
 	struct pack_idx_header *hdr;
 	size_t idx_size;
-	uint32_t nr, i, *index;
+	uint32_t version, nr, i, *index;
 	int fd = open(path, O_RDONLY);
 	struct stat st;
 
@@ -455,21 +455,23 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 	idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
 	close(fd);
 
-	/* a future index format would start with this, as older git
-	 * binaries would fail the non-monotonic index check below.
-	 * give a nicer warning to the user if we can.
-	 */
 	hdr = idx_map;
 	if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) {
-		munmap(idx_map, idx_size);
-		return error("index file %s is a newer version"
-			" and is not supported by this binary"
-			" (try upgrading GIT to a newer version)",
-			path);
-	}
+		version = ntohl(hdr->idx_version);
+		if (version < 2 || version > 2) {
+			munmap(idx_map, idx_size);
+			return error("index file %s is version %d"
+				     " and is not supported by this binary"
+				     " (try upgrading GIT to a newer version)",
+				     path, version);
+		}
+	} else
+		version = 1;
 
 	nr = 0;
 	index = idx_map;
+	if (version > 1)
+		index += 2;  /* skip index header */
 	for (i = 0; i < 256; i++) {
 		uint32_t n = ntohl(index[i]);
 		if (n < nr) {
@@ -479,19 +481,48 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 		nr = n;
 	}
 
-	/*
-	 * Total size:
-	 *  - 256 index entries 4 bytes each
-	 *  - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
-	 *  - 20-byte SHA1 of the packfile
-	 *  - 20-byte SHA1 file checksum
-	 */
-	if (idx_size != 4*256 + nr * 24 + 20 + 20) {
-		munmap(idx_map, idx_size);
-		return error("wrong index file size in %s", path);
+	if (version == 1) {
+		/*
+		 * Total size:
+		 *  - 256 index entries 4 bytes each
+		 *  - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
+		 *  - 20-byte SHA1 of the packfile
+		 *  - 20-byte SHA1 file checksum
+		 */
+		if (idx_size != 4*256 + nr * 24 + 20 + 20) {
+			munmap(idx_map, idx_size);
+			return error("wrong index file size in %s", path);
+		}
+	} else if (version == 2) {
+		/*
+		 * Minimum size:
+		 *  - 8 bytes of header
+		 *  - 256 index entries 4 bytes each
+		 *  - 20-byte sha1 entry * nr
+		 *  - 4-byte crc entry * nr
+		 *  - 4-byte offset entry * nr
+		 *  - 20-byte SHA1 of the packfile
+		 *  - 20-byte SHA1 file checksum
+		 * And after the 4-byte offset table might be a 
+		 * variable sized table containing 8-byte entries
+		 * for offsets larger than 2^31.
+		 */
+		unsigned long min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
+		if (idx_size < min_size || idx_size > min_size + (nr - 1)*8) {
+			munmap(idx_map, idx_size);
+			return error("wrong index file size in %s", path);
+		}
+		if (idx_size != min_size) {
+			/* make sure we can deal with large pack offsets */
+			off_t x = 0x7fffffffUL, y = 0xffffffffUL;
+			if (x > (x + 1) || y > (y + 1)) {
+				munmap(idx_map, idx_size);
+				return error("pack too large for current definition of off_t in %s", path);
+			}
+		}
 	}
 
-	p->index_version = 1;
+	p->index_version = version;
 	p->index_data = idx_map;
 	p->index_size = idx_size;
 	p->num_objects = nr;
@@ -1531,27 +1562,56 @@ const unsigned char *nth_packed_object_sha1(const struct packed_git *p,
 					    uint32_t n)
 {
 	const unsigned char *index = p->index_data;
-	index += 4 * 256;
 	if (n >= p->num_objects)
 		return NULL;
-	return index + 24 * n + 4;
+	index += 4 * 256;
+	if (p->index_version == 1) {
+		return index + 24 * n + 4;
+	} else {
+		index += 8;
+		return index + 20 * n;
+	}
+}
+
+static off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
+{
+	const unsigned char *index = p->index_data;
+	index += 4 * 256;
+	if (p->index_version == 1) {
+		return ntohl(*((uint32_t *)(index + 24 * n)));
+	} else {
+		uint32_t off;
+		index += 8 + p->num_objects * (20 + 4);
+		off = ntohl(*((uint32_t *)(index + 4 * n)));
+		if (!(off & 0x80000000))
+			return off;
+		index += p->num_objects * 4 + (off & 0x7fffffff) * 8;
+		return (((uint64_t)ntohl(*((uint32_t *)(index + 0)))) << 32) |
+				   ntohl(*((uint32_t *)(index + 4)));
+	}
 }
 
 off_t find_pack_entry_one(const unsigned char *sha1,
 				  struct packed_git *p)
 {
 	const uint32_t *level1_ofs = p->index_data;
-	int hi = ntohl(level1_ofs[*sha1]);
-	int lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
 	const unsigned char *index = p->index_data;
+	unsigned hi, lo;
 
+	if (p->index_version > 1) {
+		level1_ofs += 2;
+		index += 8;
+	}
 	index += 4 * 256;
+	hi = ntohl(level1_ofs[*sha1]);
+	lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
 
 	do {
-		int mi = (lo + hi) / 2;
-		int cmp = hashcmp(index + 24 * mi + 4, sha1);
+		unsigned mi = (lo + hi) / 2;
+		unsigned x = (p->index_version > 1) ? (mi * 20) : (mi * 24 + 4);
+		int cmp = hashcmp(index + x, sha1);
 		if (!cmp)
-			return ntohl(*((uint32_t *)((char *)index + (24 * mi))));
+			return nth_packed_object_offset(p, mi);
 		if (cmp > 0)
 			hi = mi;
 		else
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 09/10] show-index.c: learn about index v2
  2007-04-09  5:06               ` [PATCH 08/10] sha1_file.c: learn about " Nicolas Pitre
@ 2007-04-09  5:06                 ` Nicolas Pitre
  2007-04-09  5:06                   ` [PATCH 10/10] pack-redundant.c: " Nicolas Pitre
  0 siblings, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

When index v2 is encountered, the CRC32 of each object is also displayed
in parenthesis at the end of the line.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 show-index.c |   67 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/show-index.c b/show-index.c
index a30a2de..cae1f7d 100644
--- a/show-index.c
+++ b/show-index.c
@@ -1,14 +1,26 @@
 #include "cache.h"
+#include "pack.h"
 
 int main(int argc, char **argv)
 {
 	int i;
 	unsigned nr;
-	unsigned int entry[6];
+	unsigned int version;
 	static unsigned int top_index[256];
 
-	if (fread(top_index, sizeof(top_index), 1, stdin) != 1)
-		die("unable to read index");
+	if (fread(top_index, 2 * 4, 1, stdin) != 1)
+		die("unable to read header");
+	if (top_index[0] == htonl(PACK_IDX_SIGNATURE)) {
+		version = ntohl(top_index[1]);
+		if (version < 2 || version > 2)
+			die("unknown index version");
+		if (fread(top_index, 256 * 4, 1, stdin) != 1)
+			die("unable to read index");
+	} else {
+		version = 1;
+		if (fread(&top_index[2], 254 * 4, 1, stdin) != 1)
+			die("unable to read index");
+	}
 	nr = 0;
 	for (i = 0; i < 256; i++) {
 		unsigned n = ntohl(top_index[i]);
@@ -16,13 +28,50 @@ int main(int argc, char **argv)
 			die("corrupt index file");
 		nr = n;
 	}
-	for (i = 0; i < nr; i++) {
-		unsigned offset;
+	if (version == 1) {
+		for (i = 0; i < nr; i++) {
+			unsigned int offset, entry[6];
 
-		if (fread(entry, 24, 1, stdin) != 1)
-			die("unable to read entry %u/%u", i, nr);
-		offset = ntohl(entry[0]);
-		printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1)));
+			if (fread(entry, 4 + 20, 1, stdin) != 1)
+				die("unable to read entry %u/%u", i, nr);
+			offset = ntohl(entry[0]);
+			printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1)));
+		}
+	} else {
+		unsigned off64_nr = 0;
+		struct {
+			unsigned char sha1[20];
+			uint32_t crc;
+			uint32_t off;
+		} *entries = xmalloc(nr * sizeof(entries[0]));
+		for (i = 0; i < nr; i++)
+			if (fread(entries[i].sha1, 20, 1, stdin) != 1)
+				die("unable to read sha1 %u/%u", i, nr);
+		for (i = 0; i < nr; i++)
+			if (fread(&entries[i].crc, 4, 1, stdin) != 1)
+				die("unable to read crc %u/%u", i, nr);
+		for (i = 0; i < nr; i++)
+			if (fread(&entries[i].off, 4, 1, stdin) != 1)
+				die("unable to read 32b offset %u/%u", i, nr);
+		for (i = 0; i < nr; i++) {
+			uint64_t offset;
+			uint32_t off = ntohl(entries[i].off);
+			if (!(off & 0x80000000)) {
+				offset = off;
+			} else {
+				uint32_t off64[2];
+				if ((off & 0x7fffffff) != off64_nr)
+					die("inconsistent 64b offset index");
+				if (fread(off64, 8, 1, stdin) != 1)
+					die("unable to read 64b offset %u", off64_nr);
+				offset = (((uint64_t)ntohl(off64[0])) << 32) |
+						     ntohl(off64[1]);
+				off64_nr++;
+			}
+			printf("%llu %s (%08x)\n", offset,
+			       sha1_to_hex(entries[i].sha1), ntohl(entries[i].crc));
+		}
+		free(entries);
 	}
 	return 0;
 }
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 10/10] pack-redundant.c: learn about index v2
  2007-04-09  5:06                 ` [PATCH 09/10] show-index.c: learn about index v2 Nicolas Pitre
@ 2007-04-09  5:06                   ` Nicolas Pitre
  0 siblings, 0 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09  5:06 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Nicolas Pitre

Initially the conversion was made using nth_packed_object_sha1() which
made this file completely index version agnostic. Unfortunately the
overhead was quite significant so I went back to raw index walking but
with selectable base and step values which brought back similar
performances as the original.

Signed-off-by: Nicolas Pitre <nico@cam.org>
---
 pack-redundant.c |   47 +++++++++++++++++++++++++++--------------------
 1 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/pack-redundant.c b/pack-redundant.c
index 40e579b..87077e1 100644
--- a/pack-redundant.c
+++ b/pack-redundant.c
@@ -247,16 +247,19 @@ static struct pack_list * pack_list_difference(const struct pack_list *A,
 
 static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 {
-	int p1_off, p2_off;
+	unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step;
 	const unsigned char *p1_base, *p2_base;
 	struct llist_item *p1_hint = NULL, *p2_hint = NULL;
 
-	p1_off = p2_off = 256 * 4 + 4;
 	p1_base = p1->pack->index_data;
 	p2_base = p2->pack->index_data;
+	p1_base += 256 * 4 + ((p1->pack->index_version < 2) ? 4 : 8);
+	p2_base += 256 * 4 + ((p2->pack->index_version < 2) ? 4 : 8);
+	p1_step = (p1->pack->index_version < 2) ? 24 : 20;
+	p2_step = (p2->pack->index_version < 2) ? 24 : 20;
 
-	while (p1_off <= p1->pack->index_size - 3 * 20 &&
-	       p2_off <= p2->pack->index_size - 3 * 20)
+	while (p1_off < p1->pack->num_objects * p1_step &&
+	       p2_off < p2->pack->num_objects * p2_step)
 	{
 		int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 		/* cmp ~ p1 - p2 */
@@ -265,14 +268,14 @@ static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 					p1_base + p1_off, p1_hint);
 			p2_hint = llist_sorted_remove(p2->unique_objects,
 					p1_base + p1_off, p2_hint);
-			p1_off+=24;
-			p2_off+=24;
+			p1_off += p1_step;
+			p2_off += p2_step;
 			continue;
 		}
 		if (cmp < 0) { /* p1 has the object, p2 doesn't */
-			p1_off+=24;
+			p1_off += p1_step;
 		} else { /* p2 has the object, p1 doesn't */
-			p2_off+=24;
+			p2_off += p2_step;
 		}
 	}
 }
@@ -352,28 +355,31 @@ static int is_superset(struct pack_list *pl, struct llist *list)
 static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
 {
 	size_t ret = 0;
-	int p1_off, p2_off;
+	unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step;
 	const unsigned char *p1_base, *p2_base;
 
-	p1_off = p2_off = 256 * 4 + 4;
 	p1_base = p1->index_data;
 	p2_base = p2->index_data;
+	p1_base += 256 * 4 + ((p1->index_version < 2) ? 4 : 8);
+	p2_base += 256 * 4 + ((p2->index_version < 2) ? 4 : 8);
+	p1_step = (p1->index_version < 2) ? 24 : 20;
+	p2_step = (p2->index_version < 2) ? 24 : 20;
 
-	while (p1_off <= p1->index_size - 3 * 20 &&
-	       p2_off <= p2->index_size - 3 * 20)
+	while (p1_off < p1->num_objects * p1_step &&
+	       p2_off < p2->num_objects * p2_step)
 	{
 		int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 		/* cmp ~ p1 - p2 */
 		if (cmp == 0) {
 			ret++;
-			p1_off+=24;
-			p2_off+=24;
+			p1_off += p1_step;
+			p2_off += p2_step;
 			continue;
 		}
 		if (cmp < 0) { /* p1 has the object, p2 doesn't */
-			p1_off+=24;
+			p1_off += p1_step;
 		} else { /* p2 has the object, p1 doesn't */
-			p2_off+=24;
+			p2_off += p2_step;
 		}
 	}
 	return ret;
@@ -535,7 +541,7 @@ static void scan_alt_odb_packs(void)
 static struct pack_list * add_pack(struct packed_git *p)
 {
 	struct pack_list l;
-	size_t off;
+	unsigned long off = 0, step;
 	const unsigned char *base;
 
 	if (!p->pack_local && !(alt_odb || verbose))
@@ -544,11 +550,12 @@ static struct pack_list * add_pack(struct packed_git *p)
 	l.pack = p;
 	llist_init(&l.all_objects);
 
-	off = 256 * 4 + 4;
 	base = p->index_data;
-	while (off <= p->index_size - 3 * 20) {
+	base += 256 * 4 + ((p->index_version < 2) ? 4 : 8);
+	step = (p->index_version < 2) ? 24 : 20;
+	while (off < p->num_objects * step) {
 		llist_insert_back(l.all_objects, base + off);
-		off += 24;
+		off += step;
 	}
 	/* this list will be pruned in cmp_two_packs later */
 	l.unique_objects = llist_copy(l.all_objects);
-- 
1.5.1.696.g6d352-dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 06/10] pack-objects: learn about pack index version 2
  2007-04-09  5:06           ` [PATCH 06/10] pack-objects: learn about pack index version 2 Nicolas Pitre
  2007-04-09  5:06             ` [PATCH 07/10] index-pack: " Nicolas Pitre
@ 2007-04-09  5:32             ` Junio C Hamano
  2007-04-09 14:54               ` Nicolas Pitre
  1 sibling, 1 reply; 24+ messages in thread
From: Junio C Hamano @ 2007-04-09  5:32 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

> Support for large packs exceeding 31 bits in size won't impose an index
> size bloat for packs within that range that don't need a 64-bit offset.
> And because newer objects which are likely to be the most frequently used
> are located at the beginning of the pack, they won't pay the 64-bit offset
> lookup at run time either even if the pack is large.
>
> Right now an index version 2 is created only when the biggest offset in a
> pack reaches 31 bits.  It might be a good idea to always use index version
> 2 eventually to benefit from the CRC32 it contains when reusing pack data
> while repacking.
> ...
> @@ -582,6 +602,18 @@ static void write_index_file(void)
>  	struct object_entry **list = sorted_by_sha;
>  	struct object_entry **last = list + nr_result;
>  	uint32_t array[256];
> +	uint32_t index_version;
> +
> +	/* if last object's offset is >= 2^31 we should use index V2 */
> +	index_version = (objects[nr_result-1].offset >> 31) ? 2 : 1;

Although write_pack_file() iterates objects[] array in the
ascending order of index and calls write_one() for each of them,
in the presense of "we write base object before delta" rule, is
it always true that the last object in the objects[] array has
the largest offset?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 06/10] pack-objects: learn about pack index version 2
  2007-04-09  5:32             ` [PATCH 06/10] pack-objects: learn about pack index version 2 Junio C Hamano
@ 2007-04-09 14:54               ` Nicolas Pitre
  0 siblings, 0 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09 14:54 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sun, 8 Apr 2007, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> > Support for large packs exceeding 31 bits in size won't impose an index
> > size bloat for packs within that range that don't need a 64-bit offset.
> > And because newer objects which are likely to be the most frequently used
> > are located at the beginning of the pack, they won't pay the 64-bit offset
> > lookup at run time either even if the pack is large.
> >
> > Right now an index version 2 is created only when the biggest offset in a
> > pack reaches 31 bits.  It might be a good idea to always use index version
> > 2 eventually to benefit from the CRC32 it contains when reusing pack data
> > while repacking.
> > ...
> > @@ -582,6 +602,18 @@ static void write_index_file(void)
> >  	struct object_entry **list = sorted_by_sha;
> >  	struct object_entry **last = list + nr_result;
> >  	uint32_t array[256];
> > +	uint32_t index_version;
> > +
> > +	/* if last object's offset is >= 2^31 we should use index V2 */
> > +	index_version = (objects[nr_result-1].offset >> 31) ? 2 : 1;
> 
> Although write_pack_file() iterates objects[] array in the
> ascending order of index and calls write_one() for each of them,
> in the presense of "we write base object before delta" rule, is
> it always true that the last object in the objects[] array has
> the largest offset?

Oops, indeed it is not.  It is true in the index-pack but not here.

Please could you amend this patch with the following:

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index c7e0a42..8cf2871 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -548,11 +548,11 @@ static off_t write_one(struct sha1file *f,
 	return offset + size;
 }
 
-static void write_pack_file(void)
+static off_t write_pack_file(void)
 {
 	uint32_t i;
 	struct sha1file *f;
-	off_t offset;
+	off_t offset, last_obj_offset = 0;
 	struct pack_header hdr;
 	unsigned last_percent = 999;
 	int do_progress = progress;
@@ -575,6 +575,7 @@ static void write_pack_file(void)
 	if (!nr_result)
 		goto done;
 	for (i = 0; i < nr_objects; i++) {
+		last_obj_offset = offset;
 		offset = write_one(f, objects + i, offset);
 		if (do_progress) {
 			unsigned percent = written * 100 / nr_result;
@@ -592,9 +593,11 @@ static void write_pack_file(void)
 	if (written != nr_result)
 		die("wrote %u objects while expecting %u", written, nr_result);
 	sha1close(f, pack_file_sha1, 1);
+
+	return last_obj_offset;
 }
 
-static void write_index_file(void)
+static void write_index_file(off_t last_obj_offset)
 {
 	uint32_t i;
 	struct sha1file *f = sha1create("%s-%s.%s", base_name,
@@ -605,7 +608,7 @@ static void write_index_file(void)
 	uint32_t index_version;
 
 	/* if last object's offset is >= 2^31 we should use index V2 */
-	index_version = (objects[nr_result-1].offset >> 31) ? 2 : 1;
+	index_version = (last_obj_offset >> 31) ? 2 : 1;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -1769,6 +1772,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (reuse_cached_pack(object_list_sha1))
 		;
 	else {
+		off_t last_obj_offset;
 		if (nr_result)
 			prepare_pack(window, depth);
 		if (progress == 1 && pack_to_stdout) {
@@ -1778,9 +1782,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			signal(SIGALRM, SIG_IGN );
 			progress_update = 0;
 		}
-		write_pack_file();
+		last_obj_offset = write_pack_file();
 		if (!pack_to_stdout) {
-			write_index_file();
+			write_index_file(last_obj_offset);
 			puts(sha1_to_hex(object_list_sha1));
 		}
 	}

Nicolas

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09  5:06 support for large packs and 64-bit offsets Nicolas Pitre
  2007-04-09  5:06 ` [PATCH 01/10] get rid of num_packed_objects() Nicolas Pitre
@ 2007-04-09 17:19 ` Shawn O. Pearce
  2007-04-09 17:32   ` Nicolas Pitre
  2007-04-09 18:02   ` Linus Torvalds
  1 sibling, 2 replies; 24+ messages in thread
From: Shawn O. Pearce @ 2007-04-09 17:19 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, git

Nicolas Pitre <nico@cam.org> wrote:
> This patch series adds support for large packs to GIT.  It creates a new
> index format to accommodate both larger offsets and to store a checksum
> for each raw object in a pack.  More details is provided in each patch
> commit message.
> 
> This is not a replacement for the pack size limiting feature.  I think that
> GIT should support both: large packs and split packs.

I like this series.  I haven't had time to test it myself yet,
and probably won't be able to do so before Junio merges it into a
pu or next release.  But it looks OK on a first read.

It is unfortunate that we are changing the index file format without
also bringing in packv4 support at the same time.  I have just been
too swamped in useless bulls**t in day-job work to spend time on
Git lately.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 17:19 ` support for large packs and 64-bit offsets Shawn O. Pearce
@ 2007-04-09 17:32   ` Nicolas Pitre
  2007-04-09 17:43     ` Shawn O. Pearce
  2007-04-09 18:02   ` Linus Torvalds
  1 sibling, 1 reply; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09 17:32 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Junio C Hamano, git

On Mon, 9 Apr 2007, Shawn O. Pearce wrote:

> It is unfortunate that we are changing the index file format without
> also bringing in packv4 support at the same time.  I have just been
> too swamped in useless bulls**t in day-job work to spend time on
> Git lately.

Well... I still did index v2 with pack v4 in mind.  The diference 
between index v2 and v3 would be minimal.

Pack v4 is coming along.  Slowly but still coming.


Nicolas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 17:32   ` Nicolas Pitre
@ 2007-04-09 17:43     ` Shawn O. Pearce
  2007-04-09 19:49       ` Junio C Hamano
  0 siblings, 1 reply; 24+ messages in thread
From: Shawn O. Pearce @ 2007-04-09 17:43 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, git

Nicolas Pitre <nico@cam.org> wrote:
> On Mon, 9 Apr 2007, Shawn O. Pearce wrote:
> 
> > It is unfortunate that we are changing the index file format without
> > also bringing in packv4 support at the same time.  I have just been
> > too swamped in useless bulls**t in day-job work to spend time on
> > Git lately.
> 
> Well... I still did index v2 with pack v4 in mind.  The diference 
> between index v2 and v3 would be minimal.
> 
> Pack v4 is coming along.  Slowly but still coming.

I take it you are working on it alone at this point?  I'd love
to get back into it, but I don't think I've got the cycles for at
least a couple of weeks.


Here's something we didn't think about, but that occurred to me today
when reading this series: If we move the SHA-1 table out of the index
and into the packfile (like we are planning) dumb commit-walkers
(http-fetch) will have problems.  Right now they download the
indexes of every available packfile to determine if they need to
download the corresponding packfile to obtain a needed object.

Moving the SHA-1 table from the index into the packfile will mean
the client cannot do this `optimization'.  Instead it will need to
perform a byte-range request for part of the packfile to decide
if it needs to fetch the remainder of that packfile; or it must
download the entire packfile.  Since not all HTTP servers support
byte-range requests the former may not always be viable and the
latter is obviously not a good idea.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 17:19 ` support for large packs and 64-bit offsets Shawn O. Pearce
  2007-04-09 17:32   ` Nicolas Pitre
@ 2007-04-09 18:02   ` Linus Torvalds
  2007-04-09 18:26     ` Nicolas Pitre
  1 sibling, 1 reply; 24+ messages in thread
From: Linus Torvalds @ 2007-04-09 18:02 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Nicolas Pitre, Junio C Hamano, git



On Mon, 9 Apr 2007, Shawn O. Pearce wrote:
> 
> I like this series.  I haven't had time to test it myself yet,
> and probably won't be able to do so before Junio merges it into a
> pu or next release.  But it looks OK on a first read.

Me too.. I just wonder whether it should have some more test coverage:
 - force v2 index by default
 - force a mode where a random smattering of index entries are done using 
   64-bit notation (even if they obviously won't need the high 33 bits)

As it is, nobody would normally even *use* the new format, much less the 
new capabilities.. And if it's not used, it's not tested, and it's going 
to have bugs.

		Linus

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 18:02   ` Linus Torvalds
@ 2007-04-09 18:26     ` Nicolas Pitre
  2007-04-09 18:34       ` Shawn O. Pearce
  2007-04-09 19:46       ` Nicolas Pitre
  0 siblings, 2 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09 18:26 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Shawn O. Pearce, Junio C Hamano, git

On Mon, 9 Apr 2007, Linus Torvalds wrote:

> 
> 
> On Mon, 9 Apr 2007, Shawn O. Pearce wrote:
> > 
> > I like this series.  I haven't had time to test it myself yet,
> > and probably won't be able to do so before Junio merges it into a
> > pu or next release.  But it looks OK on a first read.
> 
> Me too.. I just wonder whether it should have some more test coverage:
>  - force v2 index by default

Easy enough.  That's what I did for testing of course.  And I'd hope it 
would be always on by default eventually.  The only reason why it is not 
on by default is because of old clients with dumb protocols.

>  - force a mode where a random smattering of index entries are done using 
>    64-bit notation (even if they obviously won't need the high 33 bits)

That could be achieved with an additional criteria, like the offset's 
LSB value which should be pretty random.

Once Junio merges the series in a branch of its own, we could have a 
separate patch not in that branch that simply forces those features on 
in branch 'next'.  People would only need to remenber not publishing 
repos with that version using dumb protocols.


Nicolas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 18:26     ` Nicolas Pitre
@ 2007-04-09 18:34       ` Shawn O. Pearce
  2007-04-09 19:46       ` Nicolas Pitre
  1 sibling, 0 replies; 24+ messages in thread
From: Shawn O. Pearce @ 2007-04-09 18:34 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Linus Torvalds, Junio C Hamano, git

Nicolas Pitre <nico@cam.org> wrote:
> Once Junio merges the series in a branch of its own, we could have a 
> separate patch not in that branch that simply forces those features on 
> in branch 'next'.  People would only need to remenber not publishing 
> repos with that version using dumb protocols.

Better to force features by config file option or command line
option, then by code hacked into a branch.  This way we can enable
the feature all of the time in a test case, and push it.  E.g. the
sliding window controls; nobody really uses them in practice (I
don't think anyway) but they are handy in the test cases to force
particular conditions, without creating huge temporary packfiles.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 18:26     ` Nicolas Pitre
  2007-04-09 18:34       ` Shawn O. Pearce
@ 2007-04-09 19:46       ` Nicolas Pitre
  1 sibling, 0 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09 19:46 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Shawn O. Pearce, Junio C Hamano, git

On Mon, 9 Apr 2007, Nicolas Pitre wrote:

> >  - force a mode where a random smattering of index entries are done using 
> >    64-bit notation (even if they obviously won't need the high 33 bits)
> 
> That could be achieved with an additional criteria, like the offset's 
> LSB value which should be pretty random.
> 
> Once Junio merges the series in a branch of its own, we could have a 
> separate patch not in that branch that simply forces those features on 
> in branch 'next'.

Something like this, although I suppose this might require a more 
permanent solution for the test suite:

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 8cf2871..8d20ea9 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -607,8 +607,12 @@ static void write_index_file(off_t last_obj_offset)
 	uint32_t array[256];
 	uint32_t index_version;
 
+#if 0
 	/* if last object's offset is >= 2^31 we should use index V2 */
 	index_version = (last_obj_offset >> 31) ? 2 : 1;
+#else
+	index_version = 2;
+#endif
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -664,8 +668,15 @@ static void write_index_file(off_t last_obj_offset)
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
 			struct object_entry *entry = *list++;
+#if 0
 			uint32_t offset = (entry->offset <= 0x7fffffff) ?
 				entry->offset : (0x80000000 | nr_large_offset++);
+#else
+			/* force some offsets to the 64-bit table for testing */
+			uint32_t offset =
+				(!(entry->offset & 1) && entry->offset <= 0x7fffffff) ?
+				entry->offset : (0x80000000 | nr_large_offset++);
+#endif
 			offset = htonl(offset);
 			sha1write(f, &offset, 4);
 		}
@@ -675,7 +686,11 @@ static void write_index_file(off_t last_obj_offset)
 		while (nr_large_offset) {
 			struct object_entry *entry = *list++;
 			uint64_t offset = entry->offset;
+#if 0
 			if (offset > 0x7fffffff) {
+#else
+			if (offset & 1 || offset > 0x7fffffff) {
+#endif
 				uint32_t split[2];
 				split[0]        = htonl(offset >> 32);
 				split[1] = htonl(offset & 0xffffffff);
diff --git a/index-pack.c b/index-pack.c
index a833f64..1e6db2b 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -718,8 +718,13 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 		die("unable to create %s: %s", index_name, strerror(errno));
 	f = sha1fd(fd, index_name);
 
+#if 0
 	/* if last object's offset is >= 2^31 we should use index V2 */
 	index_version = (objects[nr_objects-1].offset >> 31) ? 2 : 1;
+#else
+	/* force v2 always on for testing */
+	index_version = 2;
+#endif
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -779,8 +784,15 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
 			struct object_entry *obj = *list++;
+#if 0
 			uint32_t offset = (obj->offset <= 0x7fffffff) ?
 				obj->offset : (0x80000000 | nr_large_offset++);
+#else
+			/* force some offsets to the 64-bit table for testing */
+			uint32_t offset =
+				(!(obj->offset & 1) && obj->offset <= 0x7fffffff) ?
+				obj->offset : (0x80000000 | nr_large_offset++);
+#endif
 			offset = htonl(offset);
 			sha1write(f, &offset, 4);
 		}
@@ -790,7 +802,11 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 		while (nr_large_offset) {
 			struct object_entry *obj = *list++;
 			uint64_t offset = obj->offset;
+#if 0
 			if (offset > 0x7fffffff) {
+#else
+			if (offset & 1 || offset > 0x7fffffff) {
+#endif
 				uint32_t split[2];
 				split[0]	= htonl(offset >> 32);
 				split[1] = htonl(offset & 0xffffffff);

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 17:43     ` Shawn O. Pearce
@ 2007-04-09 19:49       ` Junio C Hamano
  2007-04-09 19:53         ` Shawn O. Pearce
  0 siblings, 1 reply; 24+ messages in thread
From: Junio C Hamano @ 2007-04-09 19:49 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Nicolas Pitre, git

"Shawn O. Pearce" <spearce@spearce.org> writes:

> Nicolas Pitre <nico@cam.org> wrote:
> ...
> Here's something we didn't think about, but that occurred to me today
> when reading this series: If we move the SHA-1 table out of the index
> and into the packfile (like we are planning) dumb commit-walkers
> (http-fetch) will have problems.  Right now they download the
> indexes of every available packfile to determine if they need to
> download the corresponding packfile to obtain a needed object.

If we really care about older dumb clients, one option is to
generate not .idx but .idx2, and have a corresponding .idx only
to support them.  But at that point, it's probably cleaner to
have an explicit option to produce .idx file of a particular
version, and tell people to pack public repositories they expect
older dumb clients to access with that option to keep things
backward compatible.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 19:49       ` Junio C Hamano
@ 2007-04-09 19:53         ` Shawn O. Pearce
  2007-04-09 20:02           ` Nicolas Pitre
  2007-04-09 20:18           ` Junio C Hamano
  0 siblings, 2 replies; 24+ messages in thread
From: Shawn O. Pearce @ 2007-04-09 19:53 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, git

Junio C Hamano <junkio@cox.net> wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
> 
> > Nicolas Pitre <nico@cam.org> wrote:
> > ...
> > Here's something we didn't think about, but that occurred to me today
> > when reading this series: If we move the SHA-1 table out of the index
> > and into the packfile (like we are planning) dumb commit-walkers
> > (http-fetch) will have problems.  Right now they download the
> > indexes of every available packfile to determine if they need to
> > download the corresponding packfile to obtain a needed object.
> 
> If we really care about older dumb clients, one option is to
> generate not .idx but .idx2, and have a corresponding .idx only
> to support them.  But at that point, it's probably cleaner to
> have an explicit option to produce .idx file of a particular
> version, and tell people to pack public repositories they expect
> older dumb clients to access with that option to keep things
> backward compatible.

Sure, fine.  But I think you missed my point above - right now if
we move the SHA-1 table out of the .idx file I'm not sure we know
how to support the dumb clients *at all*.  Even if they understand
the latest-and-greatest file formats...

-- 
Shawn.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 19:53         ` Shawn O. Pearce
@ 2007-04-09 20:02           ` Nicolas Pitre
  2007-04-09 20:18           ` Junio C Hamano
  1 sibling, 0 replies; 24+ messages in thread
From: Nicolas Pitre @ 2007-04-09 20:02 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Junio C Hamano, git

On Mon, 9 Apr 2007, Shawn O. Pearce wrote:

> Junio C Hamano <junkio@cox.net> wrote:
> > "Shawn O. Pearce" <spearce@spearce.org> writes:
> > 
> > > Nicolas Pitre <nico@cam.org> wrote:
> > > ...
> > > Here's something we didn't think about, but that occurred to me today
> > > when reading this series: If we move the SHA-1 table out of the index
> > > and into the packfile (like we are planning) dumb commit-walkers
> > > (http-fetch) will have problems.  Right now they download the
> > > indexes of every available packfile to determine if they need to
> > > download the corresponding packfile to obtain a needed object.
> > 
> > If we really care about older dumb clients, one option is to
> > generate not .idx but .idx2, and have a corresponding .idx only
> > to support them.  But at that point, it's probably cleaner to
> > have an explicit option to produce .idx file of a particular
> > version, and tell people to pack public repositories they expect
> > older dumb clients to access with that option to keep things
> > backward compatible.
> 
> Sure, fine.  But I think you missed my point above - right now if
> we move the SHA-1 table out of the .idx file I'm not sure we know
> how to support the dumb clients *at all*.  Even if they understand
> the latest-and-greatest file formats...

The table could live in both the pack and the index for those repos 
expected to be exportable through dumb protocols.


Nicolas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: support for large packs and 64-bit offsets
  2007-04-09 19:53         ` Shawn O. Pearce
  2007-04-09 20:02           ` Nicolas Pitre
@ 2007-04-09 20:18           ` Junio C Hamano
  1 sibling, 0 replies; 24+ messages in thread
From: Junio C Hamano @ 2007-04-09 20:18 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Nicolas Pitre, git

"Shawn O. Pearce" <spearce@spearce.org> writes:

> Sure, fine.  But I think you missed my point above - right now if
> we move the SHA-1 table out of the .idx file I'm not sure we know
> how to support the dumb clients *at all*.  Even if they understand
> the latest-and-greatest file formats...

We do an incremental .keep pack for packs 100 objects or more.
If .idx omits SHA-1 values but keeps the crc and offset, that
would be around 12 bytes per object (but you may need an index
into the real SHA-1 table in the .pack file, I dunno) so that
would be 1200 bytes.  If we duplicate SHA-1 table also in .idx
that would make it 32-byte per object, totalling 3200 bytes,
which admittedly is near 3-fold increase, but it may be worth if
we want to avoid the hassle.

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2007-04-09 20:18 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-09  5:06 support for large packs and 64-bit offsets Nicolas Pitre
2007-04-09  5:06 ` [PATCH 01/10] get rid of num_packed_objects() Nicolas Pitre
2007-04-09  5:06   ` [PATCH 02/10] make overflow test on delta base offset work regardless of variable size Nicolas Pitre
2007-04-09  5:06     ` [PATCH 03/10] add overflow tests on pack offset variables Nicolas Pitre
2007-04-09  5:06       ` [PATCH 04/10] compute a CRC32 for each object as stored in a pack Nicolas Pitre
2007-04-09  5:06         ` [PATCH 05/10] compute object CRC32 with index-pack Nicolas Pitre
2007-04-09  5:06           ` [PATCH 06/10] pack-objects: learn about pack index version 2 Nicolas Pitre
2007-04-09  5:06             ` [PATCH 07/10] index-pack: " Nicolas Pitre
2007-04-09  5:06               ` [PATCH 08/10] sha1_file.c: learn about " Nicolas Pitre
2007-04-09  5:06                 ` [PATCH 09/10] show-index.c: learn about index v2 Nicolas Pitre
2007-04-09  5:06                   ` [PATCH 10/10] pack-redundant.c: " Nicolas Pitre
2007-04-09  5:32             ` [PATCH 06/10] pack-objects: learn about pack index version 2 Junio C Hamano
2007-04-09 14:54               ` Nicolas Pitre
2007-04-09 17:19 ` support for large packs and 64-bit offsets Shawn O. Pearce
2007-04-09 17:32   ` Nicolas Pitre
2007-04-09 17:43     ` Shawn O. Pearce
2007-04-09 19:49       ` Junio C Hamano
2007-04-09 19:53         ` Shawn O. Pearce
2007-04-09 20:02           ` Nicolas Pitre
2007-04-09 20:18           ` Junio C Hamano
2007-04-09 18:02   ` Linus Torvalds
2007-04-09 18:26     ` Nicolas Pitre
2007-04-09 18:34       ` Shawn O. Pearce
2007-04-09 19:46       ` Nicolas Pitre

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.