All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Support 64-bit indexes for pack files.
@ 2007-02-26 22:40 Troy Telford
  2007-02-26 23:55 ` Shawn O. Pearce
  2007-02-28 19:46 ` Troy Telford
  0 siblings, 2 replies; 20+ messages in thread
From: Troy Telford @ 2007-02-26 22:40 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: git

As I've not been involved with git development before, I'm
aware that this may already be on somebody's 'todo' list.  It was an itch
I needed to scratch, as I have a repository whose size is multiple gigabytes,
and 'git clone' (by default) forces everything into a single
packfile with >=git-1.5.0.

Since I'm an insane form of lazy, needed the C practice, and wanted to
avoid those few keystrokes needed to tweak the default push/fetch behavior,
I'm attempting to either fix git or break it in a new and interesting way.

The patch introduces a new packfile index version, which adds a:
 * header to the index file (index version info).
 * leaves the object indexes within the index at 32-bit
 * extends the offsets used to describe the packfile to 64-bit.

The new index format is only used when the associated  packfile 
becomes large enough to warrant a 64-bit index; otherwise the original
index format is used.

Operations such as git-fsck, checkouts, diffs, and branches
all appear to work properly on x86-64 architectures.  I've done
testing on both x86 and x86_64 architectures, using a git repository
with a single 5.4 GB packfile.

32-bit architectures still have issues using the 64-bit
indexes (git- fsck, pull/fetch return various errors) so it's not yet
complete.

I did have quite a bit of help from Eric Biederman, whom
I'd like to acknowledge and thank for his help.

Signed-off-by: Troy Telford <ttelford.groups@gmail.com>
---
 builtin-pack-objects.c |   50 ++++++++++++-
 cache.h                |   13 ++-
 index-pack.c           |   30 ++++++-
 pack-check.c           |    8 +-
 pack-redundant.c       |   83 +++++++++++++++------
 pack.h                 |    4 +
 sha1_file.c            |  193 +++++++++++++++++++++++++++++++++++-------------
 show-index.c           |   81 +++++++++++++++-----
 8 files changed, 349 insertions(+), 113 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index b5ed9ce..92087c1 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -101,7 +101,7 @@ static int object_ix_hashsz;
  * get the object sha1 from the main index.
  */
 struct revindex_entry {
-	unsigned int offset;
+	off_t offset;
 	unsigned int nr;
 };
 struct pack_revindex {
@@ -163,12 +163,12 @@ static int cmp_offset(const void *a_, const void *b_)
 /*
  * Ordered list of offsets of objects in the pack.
  */
-static void prepare_pack_revindex(struct pack_revindex *rix)
+static void prepare_pack_revindex_v0(struct pack_revindex *rix)
 {
 	struct packed_git *p = rix->p;
 	int num_ent = num_packed_objects(p);
 	int i;
-	void *index = p->index_base + 256;
+	void *index = p->index.base + 256;
 
 	rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
 	for (i = 0; i < num_ent; i++) {
@@ -184,6 +184,43 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
 	qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
 }
 
+static void prepare_pack_revindex_v1(struct pack_revindex *rix)
+{
+	struct packed_git *p = rix->p;
+	int num_ent = num_packed_objects(p);
+	int i;
+	uint32_t *index = p->index.base + 256 + 2;
+
+	rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
+	for (i = 0; i < num_ent; i++) {
+		off_t offset;
+		offset = ntohl(*(index + (7 * i)));
+		offset <<=32;
+		offset |= ntohl(*(index + ((7 * i) + 1)));
+		rix->revindex[i].offset = offset;
+		rix->revindex[i].nr = i;
+	}
+	/* This knows the pack format -- the 20-byte trailer
+	 * follows immediately after the last object data.
+	 */
+	rix->revindex[num_ent].offset = p->pack_size - 20;
+	rix->revindex[num_ent].nr = -1;
+	qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
+}
+static void prepare_pack_revindex(struct pack_revindex *rix)
+{
+	switch (rix->p->index.version)
+	{
+	case 0:
+		return prepare_pack_revindex_v0(rix);
+	case 1:
+		return prepare_pack_revindex_v1(rix);
+	default:
+		die("Unsupported Pack Version");
+
+	}
+}
+
 static struct revindex_entry * find_packed_object(struct packed_git *p,
 						  unsigned int ofs)
 {
@@ -224,7 +261,12 @@ static unsigned char *find_packed_object_name(struct packed_git *p,
 					      unsigned long ofs)
 {
 	struct revindex_entry *entry = find_packed_object(p, ofs);
-	return (unsigned char *)(p->index_base + 256) + 24 * entry->nr + 4;
+	if (p->index.version == 0)
+		return (unsigned char *)(p->index.base + 256) + 24 * entry->nr + 4;
+	else if (p->index.version == 1)
+		return (unsigned char *)(p->index.base + 256 + 2) + 28 * entry->nr + 8;
+	else
+		die("Unsupported index version");
 }
 
 static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
diff --git a/cache.h b/cache.h
index 04f8e63..b329ca4 100644
--- a/cache.h
+++ b/cache.h
@@ -362,12 +362,17 @@ struct pack_window {
 	unsigned int inuse_cnt;
 };
 
+struct index_info {
+	off_t	size;
+	uint32_t *base;
+	uint32_t version;
+};
+
 extern struct packed_git {
 	struct packed_git *next;
 	struct pack_window *windows;
-	uint32_t *index_base;
-	off_t index_size;
 	off_t pack_size;
+	struct index_info index;
 	int pack_fd;
 	int pack_local;
 	unsigned char sha1[20];
@@ -376,7 +381,7 @@ extern struct packed_git {
 } *packed_git;
 
 struct pack_entry {
-	unsigned int offset;
+	off_t offset;
 	unsigned char sha1[20];
 	struct packed_git *p;
 };
@@ -420,7 +425,7 @@ extern void unuse_pack(struct pack_window **);
 extern struct packed_git *add_packed_git(char *, int, int);
 extern int num_packed_objects(const struct packed_git *p);
 extern int nth_packed_object_sha1(const struct packed_git *, int, unsigned char*);
-extern unsigned long find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
 extern void *unpack_entry(struct packed_git *, unsigned long, char *, unsigned long *);
 extern unsigned long unpack_object_header_gently(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
 extern void packed_object_info_detail(struct packed_git *, unsigned long, char *, unsigned long *, unsigned long *, unsigned int *, unsigned char *);
diff --git a/index-pack.c b/index-pack.c
index fa9a0e7..cc22ea2 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -12,7 +12,7 @@ static const char index_pack_usage[] =
 
 struct object_entry
 {
-	unsigned long offset;
+	off_t offset;
 	unsigned long size;
 	unsigned int hdr_size;
 	enum object_type type;
@@ -83,7 +83,7 @@ static unsigned display_progress(unsigned n, unsigned total, unsigned last_pc)
 
 /* We always read in 4kB chunks. */
 static unsigned char input_buffer[4096];
-static unsigned long input_offset, input_len, consumed_bytes;
+static off_t input_offset, input_len, consumed_bytes;
 static SHA_CTX input_ctx;
 static int input_fd, output_fd, pack_fd;
 
@@ -707,6 +707,13 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 		die("unable to create %s: %s", index_name, strerror(errno));
 	f = sha1fd(fd, index_name);
 
+	if ( consumed_bytes >= 0xffffffffUL )
+	{
+		struct pack_idx_header hdr;
+		hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
+		hdr.idx_version = htonl(1);
+		sha1write(f, &hdr, sizeof(hdr));
+	}
 	/*
 	 * Write the first-level table (the list is sorted,
 	 * but we use a 256-entry lookup to be able to avoid
@@ -736,9 +743,22 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
 	list = sorted_by_sha;
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = *list++;
-		unsigned int offset = htonl(obj->offset);
-		sha1write(f, &offset, 4);
-		sha1write(f, obj->sha1, 20);
+		if ( consumed_bytes < 0xffffffffUL )
+		{
+			unsigned int offset = htonl(obj->offset);
+			sha1write(f, &offset, 4);
+			sha1write(f, obj->sha1, 20);
+		}
+		else
+		{
+			uint32_t low;
+			uint32_t high;
+			low=htonl(obj->offset & 0xffffffff);
+			high=htonl((obj->offset >>32) & 0xffffffff);
+			sha1write(f, &high, 4);
+			sha1write(f, &low, 4);
+			sha1write(f, obj->sha1, 20);
+		}
 		SHA1_Update(&ctx, obj->sha1, 20);
 	}
 	sha1write(f, sha1, 20);
diff --git a/pack-check.c b/pack-check.c
index 08a9fd8..12ee933 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -4,8 +4,8 @@
 static int verify_packfile(struct packed_git *p,
 		struct pack_window **w_curs)
 {
-	unsigned long index_size = p->index_size;
-	void *index_base = p->index_base;
+	off_t index_size = p->index.size;
+	void *index_base = p->index.base;
 	SHA_CTX ctx;
 	unsigned char sha1[20];
 	unsigned long offset = 0, pack_sig = p->pack_size - 20;
@@ -123,8 +123,8 @@ static void show_pack_info(struct packed_git *p)
 
 int verify_pack(struct packed_git *p, int verbose)
 {
-	unsigned long index_size = p->index_size;
-	void *index_base = p->index_base;
+	off_t index_size = p->index.size;
+	void *index_base = p->index.base;
 	SHA_CTX ctx;
 	unsigned char sha1[20];
 	int ret;
diff --git a/pack-redundant.c b/pack-redundant.c
index edb5524..d327ae4 100644
--- a/pack-redundant.c
+++ b/pack-redundant.c
@@ -246,15 +246,29 @@ static struct pack_list * pack_list_difference(const struct pack_list *A,
 static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 {
 	int p1_off, p2_off;
+	int p1_inc, p2_inc;
 	unsigned char *p1_base, *p2_base;
 	struct llist_item *p1_hint = NULL, *p2_hint = NULL;
 
-	p1_off = p2_off = 256 * 4 + 4;
-	p1_base = (unsigned char *) p1->pack->index_base;
-	p2_base = (unsigned char *) p2->pack->index_base;
+	if (p1->pack->index.version == 0) {
+		p1_off = 256 * 4 + 4;
+		p1_inc = 24;
+	} else {
+		p1_off = 256 * 4 + 8 + 8;
+		p1_inc = 28;
+	}
+	if (p2->pack->index.version == 0) {
+		p2_off = 256 * 4 + 4;
+		p2_inc = 24;
+	} else {
+		p2_off = 256 * 4 + 8 + 8;
+		p2_inc = 28;
+	}
+	p1_base = (unsigned char *) p1->pack->index.base;
+	p2_base = (unsigned char *) p2->pack->index.base;
 
-	while (p1_off <= p1->pack->index_size - 3 * 20 &&
-	       p2_off <= p2->pack->index_size - 3 * 20)
+	while (p1_off <= p1->pack->index.size - 3 * 20 &&
+	       p2_off <= p2->pack->index.size - 3 * 20)
 	{
 		int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 		/* cmp ~ p1 - p2 */
@@ -263,14 +277,14 @@ static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 					p1_base + p1_off, p1_hint);
 			p2_hint = llist_sorted_remove(p2->unique_objects,
 					p1_base + p1_off, p2_hint);
-			p1_off+=24;
-			p2_off+=24;
+			p1_off+=p1_inc;
+			p2_off+=p2_inc;
 			continue;
 		}
 		if (cmp < 0) { /* p1 has the object, p2 doesn't */
-			p1_off+=24;
+			p1_off+=p1_inc;
 		} else { /* p2 has the object, p1 doesn't */
-			p2_off+=24;
+			p2_off+=p2_inc;
 		}
 	}
 }
@@ -351,27 +365,41 @@ static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
 {
 	size_t ret = 0;
 	int p1_off, p2_off;
+	int p1_inc, p2_inc;
 	unsigned char *p1_base, *p2_base;
 
-	p1_off = p2_off = 256 * 4 + 4;
-	p1_base = (unsigned char *)p1->index_base;
-	p2_base = (unsigned char *)p2->index_base;
+	if (p1->index.version == 0) {
+		p1_off = 256 * 4 + 4;
+		p1_inc = 24;
+	} else {
+		p1_off = 256 * 4 + 8 + 8;
+		p1_inc = 28;
+	}
+	if (p2->index.version == 0) {
+		p2_off = 256 * 4 + 4;
+		p2_inc = 24;
+	} else {
+		p2_off = 256 * 4 + 8 + 8;
+		p2_inc = 28;
+	}
+	p1_base = (unsigned char *)p1->index.base;
+	p2_base = (unsigned char *)p2->index.base;
 
-	while (p1_off <= p1->index_size - 3 * 20 &&
-	       p2_off <= p2->index_size - 3 * 20)
+	while (p1_off <= p1->index.size - 3 * 20 &&
+	       p2_off <= p2->index.size - 3 * 20)
 	{
 		int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 		/* cmp ~ p1 - p2 */
 		if (cmp == 0) {
 			ret++;
-			p1_off+=24;
-			p2_off+=24;
+			p1_off+=p1_inc;
+			p2_off+=p2_inc;
 			continue;
 		}
 		if (cmp < 0) { /* p1 has the object, p2 doesn't */
-			p1_off+=24;
+			p1_off+=p1_inc;
 		} else { /* p2 has the object, p1 doesn't */
-			p2_off+=24;
+			p2_off+=p2_inc;
 		}
 	}
 	return ret;
@@ -401,7 +429,7 @@ static inline size_t pack_set_bytecount(struct pack_list *pl)
 	size_t ret = 0;
 	while (pl) {
 		ret += pl->pack->pack_size;
-		ret += pl->pack->index_size;
+		ret += pl->pack->index.size;
 		pl = pl->next;
 	}
 	return ret;
@@ -534,6 +562,7 @@ static struct pack_list * add_pack(struct packed_git *p)
 {
 	struct pack_list l;
 	size_t off;
+	int inc;
 	unsigned char *base;
 
 	if (!p->pack_local && !(alt_odb || verbose))
@@ -541,12 +570,18 @@ static struct pack_list * add_pack(struct packed_git *p)
 
 	l.pack = p;
 	llist_init(&l.all_objects);
-
-	off = 256 * 4 + 4;
-	base = (unsigned char *)p->index_base;
-	while (off <= p->index_size - 3 * 20) {
+	if (p->index.version == 0) {
+		off = 256 * 4 + 4;
+		inc = 24;
+	}
+	if (p->index.version == 1) {
+		off = 256 * 4 + 8 + 8;
+		inc = 28;
+	}
+	base = (unsigned char *)p->index.base;
+	while (off <= p->index.size - 3 * 20) {
 		llist_insert_back(l.all_objects, base + off);
-		off += 24;
+		off += inc;
 	}
 	/* this list will be pruned in cmp_two_packs later */
 	l.unique_objects = llist_copy(l.all_objects);
diff --git a/pack.h b/pack.h
index deb427e..fbf7992 100644
--- a/pack.h
+++ b/pack.h
@@ -41,6 +41,10 @@ struct pack_header {
  * byte word.  This would be true in the proposed future index
  * format as idx_signature would be greater than idx_version.
  */
+struct pack_idx_header {
+	uint32_t idx_signature;
+	uint32_t idx_version;
+};
 #define PACK_IDX_SIGNATURE 0xff744f63	/* "\377tOc" */
 
 extern int verify_pack(struct packed_git *, int);
diff --git a/sha1_file.c b/sha1_file.c
index 2c87031..c5ab5cc 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -430,12 +430,9 @@ void pack_report()
 		pack_mapped, peak_pack_mapped);
 }
 
-static int check_packed_git_idx(const char *path, unsigned long *idx_size_,
-				void **idx_map_)
+static int check_packed_git_idx(const char *path, struct index_info * idx )
 {
-	void *idx_map;
 	uint32_t *index;
-	unsigned long idx_size;
 	int nr, i;
 	int fd = open(path, O_RDONLY);
 	struct stat st;
@@ -445,46 +442,73 @@ static int check_packed_git_idx(const char *path, unsigned long *idx_size_,
 		close(fd);
 		return -1;
 	}
-	idx_size = st.st_size;
-	idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	idx->size = st.st_size;
+	idx->base = xmmap(NULL, idx->size, PROT_READ, MAP_PRIVATE, fd, 0);
 	close(fd);
 
-	index = idx_map;
-	*idx_map_ = idx_map;
-	*idx_size_ = idx_size;
+	index = idx->base;
 
-	/* check index map */
-	if (idx_size < 4*256 + 20 + 20)
+	if (idx->size < 8)
 		return error("index file %s is too small", path);
-
+	if (index[0] != htonl(PACK_IDX_SIGNATURE))
+	{
+		/* check index map */
+		if (idx->size < 4*256 + 20 + 20)
+			return error("index file %s is too small", path);
+		nr = 0;
+		for (i = 0; i < 256; i++) {
+			unsigned int n = ntohl(index[i]);
+			if (n < nr)
+				return error("non-monotonic index %s", path);
+			nr = n;
+		}
+		/*
+		 * Total size:
+		 *  - 256 index entries 4 bytes each
+		 *  - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
+		 *  	(on 32-bit indexes)
+		 *  - 20-byte SHA1 of the packfile
+		 *  - 20-byte SHA1 file checksum
+		 */
+		if (idx->size != 4*256 + nr * 24 + 20 + 20)
+			return error("wrong index file size in %s", path);
+		idx->version = 0;
+	}
+	else if (index[0] == htonl(PACK_IDX_SIGNATURE) && ntohl(index[1]) == 1 )
+	{
+		index += 2;
+		/* check index map */
+		if (idx->size < 8 + 4*256 + 20 + 20 )
+			return error("index file %s is too small", path);
+		nr = 0;
+		for (i = 0; i < 256; i++) {
+			unsigned int n = ntohl(index[i]);
+			if (n < nr)
+				return error("non-monotonic index %s", path);
+			nr = n;
+		}
+		/*
+		 * Total size:
+		 *  - (4+4) byte header
+		 *  - 256 index entries 4 bytes each
+		 *  - 28-byte entries * nr (20-byte sha1 + 8 byte offset)
+		 *  	(on 64-bit indexes)
+		 *  - 20-byte SHA1 of the packfile
+		 *  - 20-byte SHA1 file checksum
+		 */
+		if (idx->size != 8 + 4*256 + nr * 28 + 20 + 20)
+			return error("wrong index file size in %s", path);
+		idx->version = 1;
+	}
 	/* a future index format would start with this, as older git
 	 * binaries would fail the non-monotonic index check below.
 	 * give a nicer warning to the user if we can.
 	 */
-	if (index[0] == htonl(PACK_IDX_SIGNATURE))
+	else if (index[0] == htonl(PACK_IDX_SIGNATURE))
 		return error("index file %s is a newer version"
 			" and is not supported by this binary"
 			" (try upgrading GIT to a newer version)",
 			path);
-
-	nr = 0;
-	for (i = 0; i < 256; i++) {
-		unsigned int n = ntohl(index[i]);
-		if (n < nr)
-			return error("non-monotonic index %s", path);
-		nr = n;
-	}
-
-	/*
-	 * Total size:
-	 *  - 256 index entries 4 bytes each
-	 *  - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
-	 *  - 20-byte SHA1 of the packfile
-	 *  - 20-byte SHA1 file checksum
-	 */
-	if (idx_size != 4*256 + nr * 24 + 20 + 20)
-		return error("wrong index file size in %s", path);
-
 	return 0;
 }
 
@@ -605,7 +629,7 @@ static int open_packed_git_1(struct packed_git *p)
 		return error("end of packfile %s is unavailable", p->pack_name);
 	if (read_in_full(p->pack_fd, sha1, sizeof(sha1)) != sizeof(sha1))
 		return error("packfile %s signature is unavailable", p->pack_name);
-	idx_sha1 = ((unsigned char *)p->index_base) + p->index_size - 40;
+	idx_sha1 = ((unsigned char *)p->index.base) + p->index.size - 40;
 	if (hashcmp(sha1, idx_sha1))
 		return error("packfile %s does not match index", p->pack_name);
 	return 0;
@@ -703,17 +727,16 @@ struct packed_git *add_packed_git(char *path, int path_len, int local)
 {
 	struct stat st;
 	struct packed_git *p;
-	unsigned long idx_size;
-	void *idx_map;
+	struct index_info idx;
 	unsigned char sha1[20];
 
-	if (check_packed_git_idx(path, &idx_size, &idx_map))
+	if (check_packed_git_idx(path, &idx))
 		return NULL;
 
 	/* do we have a corresponding .pack file? */
 	strcpy(path + path_len - 4, ".pack");
 	if (stat(path, &st) || !S_ISREG(st.st_mode)) {
-		munmap(idx_map, idx_size);
+		munmap(idx.base, idx.size);
 		return NULL;
 	}
 	/* ok, it looks sane as far as we can check without
@@ -721,9 +744,8 @@ struct packed_git *add_packed_git(char *path, int path_len, int local)
 	 */
 	p = xmalloc(sizeof(*p) + path_len + 2);
 	strcpy(p->pack_name, path);
-	p->index_size = idx_size;
+	p->index = idx;
 	p->pack_size = st.st_size;
-	p->index_base = idx_map;
 	p->next = NULL;
 	p->windows = NULL;
 	p->pack_fd = -1;
@@ -742,20 +764,18 @@ struct packed_git *parse_pack_index(unsigned char *sha1)
 struct packed_git *parse_pack_index_file(const unsigned char *sha1, char *idx_path)
 {
 	struct packed_git *p;
-	unsigned long idx_size;
-	void *idx_map;
+	struct index_info idx;
 	char *path;
 
-	if (check_packed_git_idx(idx_path, &idx_size, &idx_map))
+	if (check_packed_git_idx(idx_path, &idx))
 		return NULL;
 
 	path = sha1_pack_name(sha1);
 
 	p = xmalloc(sizeof(*p) + strlen(path) + 2);
 	strcpy(p->pack_name, path);
-	p->index_size = idx_size;
+	p->index = idx;
 	p->pack_size = 0;
-	p->index_base = idx_map;
 	p->next = NULL;
 	p->windows = NULL;
 	p->pack_fd = -1;
@@ -1318,7 +1338,7 @@ static void *unpack_delta_entry(struct packed_git *p,
 	return result;
 }
 
-void *unpack_entry(struct packed_git *p, unsigned long offset,
+void *unpack_entry(struct packed_git *p, off_t offset,
 			  char *type, unsigned long *sizep)
 {
 	struct pack_window *w_curs = NULL;
@@ -1350,27 +1370,56 @@ void *unpack_entry(struct packed_git *p, unsigned long offset,
 
 int num_packed_objects(const struct packed_git *p)
 {
+	int objects = 0;
 	/* See check_packed_git_idx() */
-	return (p->index_size - 20 - 20 - 4*256) / 24;
+	if (p->index.version == 0)
+		objects = (p->index.size - 20 - 20 - 4*256) / 24;
+	else if (p->index.version == 1)
+		objects = (p->index.size - 8 - 20 - 20 - 4*256) / 28;
+	return objects;
 }
 
-int nth_packed_object_sha1(const struct packed_git *p, int n,
-			   unsigned char* sha1)
+static int nth_packed_object_sha1_v0(const struct packed_git *p, int n,
+					unsigned char* sha1)
 {
-	void *index = p->index_base + 256;
+	void *index = p->index.base + 256;
 	if (n < 0 || num_packed_objects(p) <= n)
 		return -1;
 	hashcpy(sha1, (unsigned char *) index + (24 * n) + 4);
 	return 0;
 }
 
-unsigned long find_pack_entry_one(const unsigned char *sha1,
+static int nth_packed_object_sha1_v1(const struct packed_git *p, int n,
+					unsigned char* sha1)
+{
+	void *index = p->index.base + 256 + 2;
+	if (n < 0 || num_packed_objects(p) <= n)
+		return -1;
+	hashcpy(sha1, (unsigned char *) index + (28 * n) + 8);
+	return 0;
+}
+
+int nth_packed_object_sha1(const struct packed_git *p, int n,
+			   unsigned char* sha1)
+{
+	switch  (p->index.version)
+	{
+	case 0:
+		return nth_packed_object_sha1_v0(p, n, sha1);
+	case 1:
+		return nth_packed_object_sha1_v1(p, n, sha1);
+	default:
+		die("Unsupported Pack Version v%d\n", p->index.version);
+	}
+}
+
+static off_t find_pack_entry_one_v0(const unsigned char *sha1,
 				  struct packed_git *p)
 {
-	uint32_t *level1_ofs = p->index_base;
+	uint32_t *level1_ofs = p->index.base;
 	int hi = ntohl(level1_ofs[*sha1]);
 	int lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
-	void *index = p->index_base + 256;
+	void *index = p->index.base + 256;
 
 	do {
 		int mi = (lo + hi) / 2;
@@ -1385,6 +1434,46 @@ unsigned long find_pack_entry_one(const unsigned char *sha1,
 	return 0;
 }
 
+static off_t find_pack_entry_one_v1(const unsigned char *sha1,
+				  struct packed_git *p)
+{
+	uint32_t *level1_ofs = p->index.base + 2;
+	int hi = ntohl(level1_ofs[*sha1]);
+	int lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
+	uint32_t *index = p->index.base + 256 + 2;
+
+	do {
+		int mi = (lo + hi) / 2;
+		int cmp = hashcmp((unsigned char *)(index + (7 * mi) + 2), sha1);
+		if (!cmp){
+			off_t offset;
+			offset = ntohl(*(index + (7 * mi)));
+			offset <<=32;
+			offset |= ntohl(*(index + ((7 * mi) + 1)));
+			return  offset;
+		}
+		if (cmp > 0)
+			hi = mi;
+		else
+			lo = mi+1;
+	} while (lo < hi);
+	return 0;
+}
+
+off_t find_pack_entry_one(const unsigned char *sha1,
+				  struct packed_git *p)
+{
+	switch (p->index.version)
+	{
+	case 0:
+		return find_pack_entry_one_v0(sha1, p);
+	case 1:
+		return find_pack_entry_one_v1(sha1, p);
+	default:
+		die("Unsupported Pack Version: v%d\n", p->index.version);
+	}
+}
+
 static int matches_pack_name(struct packed_git *p, const char *ig)
 {
 	const char *last_c, *c;
diff --git a/show-index.c b/show-index.c
index a30a2de..c7ff161 100644
--- a/show-index.c
+++ b/show-index.c
@@ -1,28 +1,69 @@
 #include "cache.h"
+#include "pack.h"
+#include <stdio.h>
 
 int main(int argc, char **argv)
 {
-	int i;
-	unsigned nr;
-	unsigned int entry[6];
-	static unsigned int top_index[256];
+	struct pack_idx_header hdr;
 
-	if (fread(top_index, sizeof(top_index), 1, stdin) != 1)
-		die("unable to read index");
-	nr = 0;
-	for (i = 0; i < 256; i++) {
-		unsigned n = ntohl(top_index[i]);
-		if (n < nr)
-			die("corrupt index file");
-		nr = n;
-	}
-	for (i = 0; i < nr; i++) {
-		unsigned offset;
+	if (fread(&hdr, sizeof(hdr), 1, stdin) != 1)
+		die("unable to read header");
+	if (PACK_IDX_SIGNATURE != ntohl(hdr.idx_signature))
+	{
+		int i;
+		unsigned nr;
+		unsigned int entry[6];
+		static unsigned int top_index[256];
+		top_index[0] = hdr.idx_signature;
+		top_index[1] = hdr.idx_version;
+		if (fread(top_index+2, sizeof(top_index) - sizeof(top_index[0])*2, 1, stdin) != 1)
+			die("unable to read index");
+		nr = 0;
+		for (i = 0; i < 256; i++) {
+			unsigned n = ntohl(top_index[i]);
+			if (n < nr)
+				die("corrupt index file");
+			nr = n;
+		}
+		for (i = 0; i < nr; i++) {
+			unsigned offset;
 
-		if (fread(entry, 24, 1, stdin) != 1)
-			die("unable to read entry %u/%u", i, nr);
-		offset = ntohl(entry[0]);
-		printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1)));
+			if (fread(entry, 24, 1, stdin) != 1)
+				die("unable to read entry %u/%u", i, nr);
+			offset = ntohl(entry[0]);
+			printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1)));
+		}
+		return 0;
+	}
+	else if (1 == ntohl(hdr.idx_version))
+	{
+		unsigned int nr;
+		unsigned int i;
+		unsigned int entry[7];
+		uint32_t top_index[256];
+		if (fread(top_index, sizeof(top_index), 1, stdin) != 1)
+			die("unable to read index");
+		nr = 0;
+		for (i = 0; i < 256; i++) {
+			unsigned int n;
+			n = ntohl(top_index[i]);
+			if (n < nr)
+				die("corrupt version 1 index file");
+			nr = n;
+		}
+		for (i = 0; i < nr; i++) {
+			off_t offset;
+			if (fread(entry, 28, 1, stdin) != 1)
+				die("unable to read entry %u/%u", i, nr);
+			offset = ntohl(entry[0]);
+			offset <<= 32;
+			offset |= ntohl (entry[1]);
+			printf("%llu %s\n", (unsigned long long) offset,
+				sha1_to_hex((void *)(entry+2)));
+		}
+		return 0;
 	}
-	return 0;
+	else
+		die("Unsupported header version %d.",
+			hdr.idx_version);
 }
-- 
1.5.0.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-26 22:40 [PATCH] Support 64-bit indexes for pack files Troy Telford
@ 2007-02-26 23:55 ` Shawn O. Pearce
  2007-02-27  0:24   ` Nicolas Pitre
                     ` (2 more replies)
  2007-02-28 19:46 ` Troy Telford
  1 sibling, 3 replies; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-26 23:55 UTC (permalink / raw)
  To: Troy Telford; +Cc: git

Troy Telford <ttelford.groups@gmail.com> wrote:
> As I've not been involved with git development before, I'm
> aware that this may already be on somebody's 'todo' list.  It was an itch
> I needed to scratch, as I have a repository whose size is multiple gigabytes,
> and 'git clone' (by default) forces everything into a single
> packfile with >=git-1.5.0.
> 
> The patch introduces a new packfile index version, which adds a:
>  * header to the index file (index version info).
>  * leaves the object indexes within the index at 32-bit
>  * extends the offsets used to describe the packfile to 64-bit.
> 
> The new index format is only used when the associated  packfile 
> becomes large enough to warrant a 64-bit index; otherwise the original
> index format is used.

Clearly a good deal of work has been put into this patch.  I cannot
say that reviewed every bit of it... because...

Nico and I are neck deep in our pack version 4 topic.  That topic
hits all of the same code you touched with your patch.

Our topic also requires us to change the index file format, and
in doing so we have decided to extend the index records to look
something like the following[*1*]:

	object SHA-1
	64-bit offset within packfile
	32-bit index of next object in packfile

The latter field is to help pack-objects reuse existing packfile
data, as today it needs to sort everything on its own on the fly.
Having that last field of data will help avoid that, and will keep
the index nicely aligned for 64-bit accesses to the offset.

I want to say your patch shouldn't be merged without even bothering
to review it.  The last time I was in this part of the git code
(implementing sliding mmap window) Nico and Junio also both went in
here and rewrote huge chunks.  Their changes prevented sliding mmap
window from applying.  It was 6 months before I got back around to
rewriting the patch.

Right now I'm neck deep in pack v4.  I hope to have the topic in
pu-ready state by some time mid-next week, hopefully in time for
Junio's git day.  I'm very unlikely to have the time to rewrite the
topic again until late June/July if something like your patch goes
in now.

So would you mind waiting a couple of weeks for 64 bit indexes?


*1*: This was Nico's idea; I'm in agreement with him about it.
     The actual disk layout here has not been set in stone as we
     want to kick around different layouts before we settle on
     something final.  I'm a little unhappy with the way the above
     lays out in memory when we mmap it, for example.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-26 23:55 ` Shawn O. Pearce
@ 2007-02-27  0:24   ` Nicolas Pitre
  2007-02-27  0:31     ` Shawn O. Pearce
  2007-02-27  1:16   ` Troy Telford
  2007-02-27  4:56   ` Nicolas Pitre
  2 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27  0:24 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Troy Telford, git

On Mon, 26 Feb 2007, Shawn O. Pearce wrote:

> Our topic also requires us to change the index file format, and
> in doing so we have decided to extend the index records to look
> something like the following[*1*]:
> 
> 	object SHA-1
> 	64-bit offset within packfile
> 	32-bit index of next object in packfile
> 
> The latter field is to help pack-objects reuse existing packfile
> data, as today it needs to sort everything on its own on the fly.
> Having that last field of data will help avoid that, and will keep
> the index nicely aligned for 64-bit accesses to the offset.

Wouldn't that later field help the sliding mmap code as well, knowing in 
advance what storage size a given object has? (I didn't look at the 
sliding mmap code so I don't know).

Actually I've been thinking about another format already.

What about keeping the pack offset as 32 bits like it is today, but for 
index v2 if the top bit is set then this become an index into another 
table containing 64-bit offsets as needed.  This way there is no waste 
of space for most projects where the pack has yet to reach the 2GB limit 
for many years to come.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27  0:24   ` Nicolas Pitre
@ 2007-02-27  0:31     ` Shawn O. Pearce
  2007-02-27  4:32       ` Nicolas Pitre
  0 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-27  0:31 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Troy Telford, git

Nicolas Pitre <nico@cam.org> wrote:
> On Mon, 26 Feb 2007, Shawn O. Pearce wrote:
> > The latter field is to help pack-objects reuse existing packfile
> > data, as today it needs to sort everything on its own on the fly.
> > Having that last field of data will help avoid that, and will keep
> > the index nicely aligned for 64-bit accesses to the offset.
> 
> Wouldn't that later field help the sliding mmap code as well, knowing in 
> advance what storage size a given object has? (I didn't look at the 
> sliding mmap code so I don't know).

Yes, it probably would.
 
> Actually I've been thinking about another format already.
> 
> What about keeping the pack offset as 32 bits like it is today, but for 
> index v2 if the top bit is set then this become an index into another 
> table containing 64-bit offsets as needed.  This way there is no waste 
> of space for most projects where the pack has yet to reach the 2GB limit 
> for many years to come.

Actually Troy's patch tries to do this by using the current format
and only switching to the new one if the packfile exceeds 4 GiB.
Rather smart.

One thought I had here was to expand the fan-out table from 1<<8
entries to 1<<16 entries, then store only the low 18 bytes of
the SHA-1.  We would have another 2 bytes worth of space to store
the offset, pushing our total offset up to 48 bits.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-26 23:55 ` Shawn O. Pearce
  2007-02-27  0:24   ` Nicolas Pitre
@ 2007-02-27  1:16   ` Troy Telford
  2007-02-27  4:56   ` Nicolas Pitre
  2 siblings, 0 replies; 20+ messages in thread
From: Troy Telford @ 2007-02-27  1:16 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: git

On Monday 26 February 2007, Shawn O. Pearce wrote:

> Nico and I are neck deep in our pack version 4 topic.  That topic
> hits all of the same code you touched with your patch.

It looked like that may have been the case; but I figured it wouldn't hurt.  
I've been coping with the 'too-small' index issue for about a year now, and 
have been on IRC and kept at least a lazy eye on it hoping that it would be 
attended to.

I've decided I need to start doing some C coding again, and it seemed to be as 
good an itch as any to scratch...

> Our topic also requires us to change the index file format, and
> in doing so we have decided to extend the index records to look
> something like the following[*1*]:
>
> 	object SHA-1
> 	64-bit offset within packfile
> 	32-bit index of next object in packfile
>
> The latter field is to help pack-objects reuse existing packfile
> data, as today it needs to sort everything on its own on the fly.
> Having that last field of data will help avoid that, and will keep
> the index nicely aligned for 64-bit accesses to the offset.

Exactly why I left the packfile itself alone.  Well, that and laziness.  I may 
have a huge repository, but I don't have single files so large that it needs 
a 32-bit index for the next object in the packfile.

Most of the things that large I've seen are binary anyway.

> I want to say your patch shouldn't be merged without even bothering
> to review it.  

No hard feelings on my part, it was as much a learning thing for me as 
anything else.

> The last time I was in this part of the git code 
> (implementing sliding mmap window) Nico and Junio also both went in
> here and rewrote huge chunks.  Their changes prevented sliding mmap
> window from applying.  It was 6 months before I got back around to
> rewriting the patch.

Ouch.

> Right now I'm neck deep in pack v4.  I hope to have the topic in
> pu-ready state by some time mid-next week, hopefully in time for
> Junio's git day.  I'm very unlikely to have the time to rewrite the
> topic again until late June/July if something like your patch goes
> in now.
>
> So would you mind waiting a couple of weeks for 64 bit indexes?

All I'm concerned with is being able to handle my (huge) source repository 
with git.  I wrote the patch in the hopes that it would accelerate the 
process, and that I'd learn something.  If all I have to do is wait that's 
fine.  I figured I'd at least be able to bring an idea or two to the table.  
If the code doesn't get accepted, but still get the desired functionality, I 
still met 1/2 of my goals in doing it.
-- 
Troy Telford

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27  0:31     ` Shawn O. Pearce
@ 2007-02-27  4:32       ` Nicolas Pitre
  2007-02-27  4:55         ` Geert Bosch
  0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27  4:32 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Troy Telford, git

On Mon, 26 Feb 2007, Shawn O. Pearce wrote:

> Nicolas Pitre <nico@cam.org> wrote:
> > Actually I've been thinking about another format already.
> > 
> > What about keeping the pack offset as 32 bits like it is today, but for 
> > index v2 if the top bit is set then this become an index into another 
> > table containing 64-bit offsets as needed.  This way there is no waste 
> > of space for most projects where the pack has yet to reach the 2GB limit 
> > for many years to come.
> 
> Actually Troy's patch tries to do this by using the current format
> and only switching to the new one if the packfile exceeds 4 GiB.
> Rather smart.

Yes I saw the patch.  But what I propose is different.  In fact I'd 
require far less changes to the existing code.  The idea is to continue 
to store a 32-bit value along with the SHA1 just like we do today.  
Then, appended to that would be another table containing a list of 
64-bit offsets.

Now if the offset stored in the index is smaller than 2GB you store it 
as we do today.  If it is >= 2GB then a 64-bit index would be added to 
the extra offset table and the 32-bit entry along with the SHA1 would be 
an index into that second table instead, with the top bit set to 
distinguish it from a normal 32-bit offset (actually 31 bits).  So for 
offsets larger than 31 bits then they have an additional level of 
indirection.

The code to implement this would be minimal.  And since objects placed 
at the end of a pack (those more likely to incure the indirection 
overhead) are further back in history they won't get accessed 
very often anyway.

Then nothing prevents us from inserting the next-object-index table in 
between (its size is known while the 64-bit offset one may vary) then 
the code that doesn't care about it need no look at it. 

> One thought I had here was to expand the fan-out table from 1<<8
> entries to 1<<16 entries, then store only the low 18 bytes of
> the SHA-1.  We would have another 2 bytes worth of space to store
> the offset, pushing our total offset up to 48 bits.

That would penalize small packs a lot.  the index would always start 
from 256KB in size.  With a pack of 100 objects (our current treshold 
for keeping a pack) that means a 258KB index file.  Currently the index 
file for a 100-object pack is 3.4KB.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27  4:32       ` Nicolas Pitre
@ 2007-02-27  4:55         ` Geert Bosch
  2007-02-27  5:11           ` Nicolas Pitre
  0 siblings, 1 reply; 20+ messages in thread
From: Geert Bosch @ 2007-02-27  4:55 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Shawn O. Pearce, Troy Telford, git


On Feb 26, 2007, at 23:32, Nicolas Pitre wrote:

>> One thought I had here was to expand the fan-out table from 1<<8
>> entries to 1<<16 entries, then store only the low 18 bytes of
>> the SHA-1.  We would have another 2 bytes worth of space to store
>> the offset, pushing our total offset up to 48 bits.
>
> That would penalize small packs a lot.  the index would always start
> from 256KB in size.  With a pack of 100 objects (our current treshold
> for keeping a pack) that means a 258KB index file.  Currently the  
> index
> file for a 100-object pack is 3.4KB.

Why can't we do it with the current 1<<8 entry fan-out?
This would allow increases of pack file size up to 1 TB.
For larger repositories, we just need to use multiple
pack files. A couple hundred 1 TB pack files doesn't seem
to be a big issue.

Say a couple years from now, we can write data to stable storage
(disks/flash/holograms or whatever) at 1 GB/sec, then it would still
take 16 minutes to write a 1 TB file. At that point we'd need a
bigger overhaul than just larger offsets in the pack file.

BTW, here are a few issues with the current pack file format:
   - The final SHA1 consists of the count of objects in the file
     and all compressed data. Why? This is horrible for streaming
     applications where you only know the count of objects at the
     end, then you need to access *all* data to compute the SHA-1.
     Much better to just use compute a SHA1 over the SHA1's of each
     object. That way at least the data streamed can be streamed to
     disk. Buffering one SHA1 per object is probably going to be OK.

   - The object count is implicit in the SHA1 of all objects and the
     objects we find in the file. Why do we need it in the first place?
     Better to recompute it when necessary. This makes true streaming
     possible.

   -Geert

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-26 23:55 ` Shawn O. Pearce
  2007-02-27  0:24   ` Nicolas Pitre
  2007-02-27  1:16   ` Troy Telford
@ 2007-02-27  4:56   ` Nicolas Pitre
  2 siblings, 0 replies; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27  4:56 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Troy Telford, git

On Mon, 26 Feb 2007, Shawn O. Pearce wrote:

> Nico and I are neck deep in our pack version 4 topic.  That topic
> hits all of the same code you touched with your patch.

Although a good deal of code has been written already, we're still 
throwing completely crazy ideas back and forth which, if they survive 
the reality test, might change things quite a lot for the index.

> Our topic also requires us to change the index file format, and
> in doing so we have decided to extend the index records to look
> something like the following[*1*]:

[ delete new index format layout description ]

Although I suggested that format... like... yesterday, I'm already 
dismissing it now.  I'd go with what I just posted instead.

But yet this depends on whether or not we consider this latest idea of 
ours a good thing or not, which require some math to be done to prove 
its usefulness.

Maybe we should post our notes here in order to gather extra comments 
from people or to debunk some of our insane ideas faster.

> I want to say your patch shouldn't be merged without even bothering
> to review it.  The last time I was in this part of the git code
> (implementing sliding mmap window) Nico and Junio also both went in
> here and rewrote huge chunks.  Their changes prevented sliding mmap
> window from applying.  It was 6 months before I got back around to
> rewriting the patch.

The thing is, regardless of the pack v4 work, I think the approach used 
is not the best one.  And because of the pack v4 work, we'll have to 
settle on something before too long anyway.

> So would you mind waiting a couple of weeks for 64 bit indexes?

Of course Troy's patch might be used by those who need it now.  I think 
Junio might even park it in pu for reference.  The pack index is not a 
precious piece of information and it can be thrown away and recreated 
with git-index-pack whenever there is an officially supported new index 
format.

> *1*: This was Nico's idea; I'm in agreement with him about it.

As mentioned above I'm not in agreement with myself anymore about it.  
;-)


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27  4:55         ` Geert Bosch
@ 2007-02-27  5:11           ` Nicolas Pitre
  2007-02-27 16:04             ` Geert Bosch
  0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27  5:11 UTC (permalink / raw)
  To: Geert Bosch; +Cc: Shawn O. Pearce, Troy Telford, git

On Mon, 26 Feb 2007, Geert Bosch wrote:

> Why can't we do it with the current 1<<8 entry fan-out?
> This would allow increases of pack file size up to 1 TB.

I had the exact same thought while I was writing the previous mail.
It is indeed perfectly fine and would require less than 10 lines of code 
to implement.

> BTW, here are a few issues with the current pack file format:
>  - The final SHA1 consists of the count of objects in the file
>    and all compressed data. Why? This is horrible for streaming
>    applications where you only know the count of objects at the
>    end, then you need to access *all* data to compute the SHA-1.
>    Much better to just use compute a SHA1 over the SHA1's of each
>    object. That way at least the data streamed can be streamed to
>    disk. Buffering one SHA1 per object is probably going to be OK.

We always know the number of objects before actually constructing or 
streaming a pack.  Finding best delta matches require that we sort the 
object list by type, but for good locality we need to re-sort that list 
by recency.  So we always know the number of objects before starting to 
write since we need to have the list of objects in memory anyway.

Also the receiving end of a streamed pack wants to know the number of 
objects first if only to provide the user with some progress report.

>  - The object count is implicit in the SHA1 of all objects and the
>    objects we find in the file. Why do we need it in the first place?
>    Better to recompute it when necessary. This makes true streaming
>    possible.

Sorry, I don't follow you here.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27  5:11           ` Nicolas Pitre
@ 2007-02-27 16:04             ` Geert Bosch
  2007-02-27 16:11               ` Shawn O. Pearce
                                 ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 16:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Shawn O. Pearce, Troy Telford, git


On Feb 27, 2007, at 00:11, Nicolas Pitre wrote:

>> BTW, here are a few issues with the current pack file format:
>>  - The final SHA1 consists of the count of objects in the file
>>    and all compressed data. Why? This is horrible for streaming
>>    applications where you only know the count of objects at the
>>    end, then you need to access *all* data to compute the SHA-1.
>>    Much better to just use compute a SHA1 over the SHA1's of each
>>    object. That way at least the data streamed can be streamed to
>>    disk. Buffering one SHA1 per object is probably going to be OK.
>
> We always know the number of objects before actually constructing or
> streaming a pack.  Finding best delta matches require that we sort the
> object list by type, but for good locality we need to re-sort that  
> list
> by recency.  So we always know the number of objects before  
> starting to
> write since we need to have the list of objects in memory anyway.

When I import a large code-base (such as a *.tar.gz), I don't know
beforehand how many objects I'm going to create. Ideally, I'd like
to stream them directly into a new pack without ever having to write
the expanded source to the filesystem.

> Also the receiving end of a streamed pack wants to know the number of
> objects first if only to provide the user with some progress report.
>
>>  - The object count is implicit in the SHA1 of all objects and the
>>    objects we find in the file. Why do we need it in the first place?
>>    Better to recompute it when necessary. This makes true streaming
>>    possible.
>
> Sorry, I don't follow you here.

The object-count at the beginning of the pack is a little strange for
local on-disk pack files, as it is data that can easily be derived.
The *index* would seem to be the proper place for this.

Also, it is not possible to write a dummy 0 in the count and then  
fill in
the correct count at the end, because the final SHA1 at the end of  
the pack
file is a checksum over the count followed by all the pack data.
So for creating a large pack from a stream of data, you have to do  
the following:
   1. write out a temporary pack file to disk without correct count
   2. fix-up the count
   3. read the entire temporary pack file to compute the final SHA-1
   4. fix-up the SHA1 at the end of the file
   5. construct and write out the index

There are a few ways to fixing this:
   - Have a count of 0xffffffff mean: look in the index for the count.
     Pulling/pushing would still use regular counted pack files.
   - Have the pack file checksum be the SHA1 of (the count followed
     by the SHA1 of the compressed data of each object). This would  
allow 3.
     to be done without reading back all data.

   -Geert

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 16:04             ` Geert Bosch
@ 2007-02-27 16:11               ` Shawn O. Pearce
  2007-02-27 16:55                 ` Geert Bosch
  2007-02-27 17:03               ` Nicolas Pitre
  2007-02-27 20:05               ` Johannes Schindelin
  2 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-27 16:11 UTC (permalink / raw)
  To: Geert Bosch; +Cc: Nicolas Pitre, Troy Telford, git

Geert Bosch <bosch@adacore.com> wrote:
> When I import a large code-base (such as a *.tar.gz), I don't know
> beforehand how many objects I'm going to create. Ideally, I'd like
> to stream them directly into a new pack without ever having to write
> the expanded source to the filesystem.

See git-fast-import.  If you are coming from a tar, also see
contrib/fast-import/import-tars.perl.  :-)
 
> So for creating a large pack from a stream of data, you have to do  
> the following:
>   1. write out a temporary pack file to disk without correct count
>   2. fix-up the count
>   3. read the entire temporary pack file to compute the final SHA-1
>   4. fix-up the SHA1 at the end of the file
>   5. construct and write out the index

Yes, this is exactly what git-fast-import does.  Yes, it sort of
sucks.  But its not as bad as you think.
 
> There are a few ways to fixing this:
>   - Have a count of 0xffffffff mean: look in the index for the count.
>     Pulling/pushing would still use regular counted pack files.
>   - Have the pack file checksum be the SHA1 of (the count followed
>     by the SHA1 of the compressed data of each object). This would  
> allow 3.
>     to be done without reading back all data.

I don't think it is worth it.  Aside from git-fast-import we
always know the object count before we start writing any data.
But despite that, fast-import runs quite well.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 16:11               ` Shawn O. Pearce
@ 2007-02-27 16:55                 ` Geert Bosch
  2007-02-27 17:36                   ` Nicolas Pitre
  0 siblings, 1 reply; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 16:55 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Nicolas Pitre, Troy Telford, git

On Feb 27, 2007, at 11:11, Shawn O. Pearce wrote:
> Geert Bosch <bosch@adacore.com> wrote:
>> When I import a large code-base (such as a *.tar.gz), I don't know
>> beforehand how many objects I'm going to create. Ideally, I'd like
>> to stream them directly into a new pack without ever having to write
>> the expanded source to the filesystem.
>
> See git-fast-import.  If you are coming from a tar, also see
> contrib/fast-import/import-tars.perl.  :-)

Yes, I saw that, really nice. I had written something myself to
create pack files from a streaming data source. Basically, I'm
breaking arbitrary data-streams (mostly backups) into chunks
along content-defined boundaries and then link the chunks
together in a tree. This eliminates any duplicate files,
and even chunks of larger files that are identical. I create
new pack files whenever the old one gets larger than a certain
predefined size (128 MB, currently).

I'm happy I can base this on git-fast-import now.

>> So for creating a large pack from a stream of data, you have to do
>> the following:
>>   1. write out a temporary pack file to disk without correct count
>>   2. fix-up the count
>>   3. read the entire temporary pack file to compute the final SHA-1
>>   4. fix-up the SHA1 at the end of the file
>>   5. construct and write out the index
>
> Yes, this is exactly what git-fast-import does.  Yes, it sort of
> sucks.  But its not as bad as you think.

For smaller packs, the I/O is all going to be buffered anyway,
but if we're going to have >4GB pack files, it adds a lot of real
I/O  and SHA1 computation for no good reason. If we get a rare chance
to have a new pack format, why not fix this wart at the same time?

   -Geert

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 16:04             ` Geert Bosch
  2007-02-27 16:11               ` Shawn O. Pearce
@ 2007-02-27 17:03               ` Nicolas Pitre
  2007-02-27 20:05               ` Johannes Schindelin
  2 siblings, 0 replies; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 17:03 UTC (permalink / raw)
  To: Geert Bosch; +Cc: Shawn O. Pearce, Troy Telford, git

On Tue, 27 Feb 2007, Geert Bosch wrote:

> 
> On Feb 27, 2007, at 00:11, Nicolas Pitre wrote:
> 
> > > BTW, here are a few issues with the current pack file format:
> > > - The final SHA1 consists of the count of objects in the file
> > >   and all compressed data. Why? This is horrible for streaming
> > >   applications where you only know the count of objects at the
> > >   end, then you need to access *all* data to compute the SHA-1.
> > >   Much better to just use compute a SHA1 over the SHA1's of each
> > >   object. That way at least the data streamed can be streamed to
> > >   disk. Buffering one SHA1 per object is probably going to be OK.
> > 
> > We always know the number of objects before actually constructing or
> > streaming a pack.  Finding best delta matches require that we sort the
> > object list by type, but for good locality we need to re-sort that list
> > by recency.  So we always know the number of objects before starting to
> > write since we need to have the list of objects in memory anyway.
> 
> When I import a large code-base (such as a *.tar.gz), I don't know
> beforehand how many objects I'm going to create. Ideally, I'd like
> to stream them directly into a new pack without ever having to write
> the expanded source to the filesystem.

Have a look at git-fast-import and contrib/fast-import/import-tars.perl.

Regardless, you cannot produce a decent pack without knowing in advance 
how many objects you have.  Thisis why git-fast-import produces a 
suboptimal pack that needs to be repacked in the end.

> The object-count at the beginning of the pack is a little strange for
> local on-disk pack files, as it is data that can easily be derived.
> The *index* would seem to be the proper place for this.

The index is _not_ sent over the network protocol by design.  But like I 
said the receiving end wants to know up front how many objects it'll 
have to parse.

> Also, it is not possible to write a dummy 0 in the count and then fill in
> the correct count at the end, because the final SHA1 at the end of the pack
> file is a checksum over the count followed by all the pack data.
> So for creating a large pack from a stream of data, you have to do the
> following:
>  1. write out a temporary pack file to disk without correct count
>  2. fix-up the count
>  3. read the entire temporary pack file to compute the final SHA-1
>  4. fix-up the SHA1 at the end of the file
>  5. construct and write out the index

Sure.  That's in fact what index-pack does when it has to fix up a thin 
pack. But did you find any _real_ issue with this?  Resolving deltas and 
recomputing the index is far more costly than simply redoing the whole 
pack checksum.

> There are a few ways to fixing this:
>  - Have a count of 0xffffffff mean: look in the index for the count.
>    Pulling/pushing would still use regular counted pack files.
>  - Have the pack file checksum be the SHA1 of (the count followed
>    by the SHA1 of the compressed data of each object). This would allow 3.
>    to be done without reading back all data.

Well we _could_ exclude the object count (or even the entire pack 
header for that matter) from the pack checksum.  This way a streamed 
pack streamed to disk could be seeked back to have its object number 
fixed up once it is known without needing to start the checksum all 
over.  Maybe something to consider for pack v4.  But in practice I don't 
see that as a big issue.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 16:55                 ` Geert Bosch
@ 2007-02-27 17:36                   ` Nicolas Pitre
  2007-02-28  3:52                     ` Shawn O. Pearce
  0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 17:36 UTC (permalink / raw)
  To: Geert Bosch; +Cc: Shawn O. Pearce, Troy Telford, git

On Tue, 27 Feb 2007, Geert Bosch wrote:

> For smaller packs, the I/O is all going to be buffered anyway,
> but if we're going to have >4GB pack files, it adds a lot of real
> I/O  and SHA1 computation for no good reason. If we get a rare chance
> to have a new pack format, why not fix this wart at the same time?

Fair enough.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 16:04             ` Geert Bosch
  2007-02-27 16:11               ` Shawn O. Pearce
  2007-02-27 17:03               ` Nicolas Pitre
@ 2007-02-27 20:05               ` Johannes Schindelin
  2007-02-27 20:25                 ` Geert Bosch
  2 siblings, 1 reply; 20+ messages in thread
From: Johannes Schindelin @ 2007-02-27 20:05 UTC (permalink / raw)
  To: Geert Bosch; +Cc: Nicolas Pitre, Shawn O. Pearce, Troy Telford, git

Hi,

On Tue, 27 Feb 2007, Geert Bosch wrote:

> The object-count at the beginning of the pack is a little strange for 
> local on-disk pack files, as it is data that can easily be derived.

The SHA1 is also easily derived. So think of it as a doubly secure way to 
ensure integrity.

Remember, there were some people unable to accept that SHA1 collisions are 
_unlikely_...

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 20:05               ` Johannes Schindelin
@ 2007-02-27 20:25                 ` Geert Bosch
  2007-02-27 20:35                   ` Johannes Schindelin
  0 siblings, 1 reply; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 20:25 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Nicolas Pitre, Shawn O. Pearce, Troy Telford, git


On Feb 27, 2007, at 15:05, Johannes Schindelin wrote:
> On Tue, 27 Feb 2007, Geert Bosch wrote:
>
>> The object-count at the beginning of the pack is a little strange for
>> local on-disk pack files, as it is data that can easily be derived.
>
> The SHA1 is also easily derived. So think of it as a doubly secure  
> way to
> ensure integrity.
>
> Remember, there were some people unable to accept that SHA1  
> collisions are
> _unlikely_...

Ah, you include the count because you don't trust the SHA1
and now you can count that you got the right number of objects ;-)

It's just that I don't see a use for the count in a local on-disk
pack. In these cases we'll always have an index. We can keep the
current protocol for sending/receiving packs, no need to ever upgrade
to v4 there.

   -Geert

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 20:25                 ` Geert Bosch
@ 2007-02-27 20:35                   ` Johannes Schindelin
  0 siblings, 0 replies; 20+ messages in thread
From: Johannes Schindelin @ 2007-02-27 20:35 UTC (permalink / raw)
  To: Geert Bosch; +Cc: Nicolas Pitre, Shawn O. Pearce, Troy Telford, git

Hi,

On Tue, 27 Feb 2007, Geert Bosch wrote:

> 
> On Feb 27, 2007, at 15:05, Johannes Schindelin wrote:
> > On Tue, 27 Feb 2007, Geert Bosch wrote:
> > 
> > > The object-count at the beginning of the pack is a little strange 
> > > for local on-disk pack files, as it is data that can easily be 
> > > derived.
> > 
> > The SHA1 is also easily derived. So think of it as a doubly secure way 
> > to ensure integrity.
> > 
> > Remember, there were some people unable to accept that SHA1 collisions 
> > are _unlikely_...
> 
> Ah, you include the count because you don't trust the SHA1 and now you 
> can count that you got the right number of objects ;-)

I don't know if it was included for that reason, but you can excuse it 
that way. :-)

> It's just that I don't see a use for the count in a local on-disk pack. 
> In these cases we'll always have an index. We can keep the current 
> protocol for sending/receiving packs, no need to ever upgrade to v4 
> there.

But the protocol expects a valid pack! (Including a count...)

Besides, the pack index is a purely local thin, reconstructed from the 
pack when fetching...

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-27 17:36                   ` Nicolas Pitre
@ 2007-02-28  3:52                     ` Shawn O. Pearce
  2007-02-28  4:12                       ` Nicolas Pitre
  0 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-28  3:52 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Geert Bosch, Troy Telford, git

Nicolas Pitre <nico@cam.org> wrote:
> On Tue, 27 Feb 2007, Geert Bosch wrote:
> > For smaller packs, the I/O is all going to be buffered anyway,
> > but if we're going to have >4GB pack files, it adds a lot of real
> > I/O  and SHA1 computation for no good reason. If we get a rare chance
> > to have a new pack format, why not fix this wart at the same time?
> 
> Fair enough.

OK, so lets say that if both ends of a network transport support
pack v4 then we can use pack v4.  If pack v4 omits the count field
from its header (because its easily derived or obtained from the
index, and doesn't add any additional data protection over the
SHA-1s) why not add some machine-readable sideband that can provide
transfer progress?

I think we would want four values, number of objects (sent/total)
and uncompressed bytes (sent/total), to send to the client.

Estimating the total uncompressed bytes is very easy in pack-objects
before we start sending even the header; actually if we are reusing
a majority of the objects from an existing packfile we even have
a good approximation of the compressed size ready.  That would
give the client a reasonable progress meter; certainly better than
nothing at all!  ;-)

-- 
Shawn.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-28  3:52                     ` Shawn O. Pearce
@ 2007-02-28  4:12                       ` Nicolas Pitre
  0 siblings, 0 replies; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-28  4:12 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Geert Bosch, Troy Telford, git

On Tue, 27 Feb 2007, Shawn O. Pearce wrote:

> Nicolas Pitre <nico@cam.org> wrote:
> > On Tue, 27 Feb 2007, Geert Bosch wrote:
> > > For smaller packs, the I/O is all going to be buffered anyway,
> > > but if we're going to have >4GB pack files, it adds a lot of real
> > > I/O  and SHA1 computation for no good reason. If we get a rare chance
> > > to have a new pack format, why not fix this wart at the same time?
> > 
> > Fair enough.
> 
> OK, so lets say that if both ends of a network transport support
> pack v4 then we can use pack v4.  If pack v4 omits the count field
> from its header (because its easily derived or obtained from the
> index, and doesn't add any additional data protection over the
> SHA-1s)

I don't agree we should omit it at all.

I agree it might just not be included in the whole pack SHA1. Actually 
the whole pack header might be ommitted from the pack SHA1. This way, if 
data is appended at the end like index-pack does in the thin pack case, 
the header can be fixed up without invalidating the SHA1 that was 
computed while receiving the pack.  Why the whole header?  Well who 
knows if we won't dynamically change the pack version while receiving it 
as well in the future.  In fact, we could just add the header to the 
SHA1 checksum but after the data payload (for the checksum only not in 
the actual pack).  this way we preserve the same protection/validity 
strength as we have today.

> why not add some machine-readable sideband that can provide
> transfer progress?
> 
> I think we would want four values, number of objects (sent/total)
> and uncompressed bytes (sent/total), to send to the client.

Uncompressed bytes is meaningless when evaluating throughput of 
compressed data since compression ratio is variable during a single 
fetch.  It doesn't provide more accurate progress than what we have 
today with number of objects either.

> Estimating the total uncompressed bytes is very easy in pack-objects
> before we start sending even the header; actually if we are reusing
> a majority of the objects from an existing packfile we even have
> a good approximation of the compressed size ready.  That would
> give the client a reasonable progress meter; certainly better than
> nothing at all!  ;-)

IMHO it is not worth it.  And it won't help git-pack-index if run on a 
local pack either.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Support 64-bit indexes for pack files.
  2007-02-26 22:40 [PATCH] Support 64-bit indexes for pack files Troy Telford
  2007-02-26 23:55 ` Shawn O. Pearce
@ 2007-02-28 19:46 ` Troy Telford
  1 sibling, 0 replies; 20+ messages in thread
From: Troy Telford @ 2007-02-28 19:46 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: git

Subject: [PATCH] 32-bit support for large indexes

This patch continues the work I've done on large indexes, and was completed
largely as an educational exercise; I'm obviously aware of Pack v4 making
these two patches unnecessary, and that inclusion is unlikely.  I'm
submitting the patch to make my previous patch on 64-bit indexes more
complete, should anyone actually use it.

It also introduced changes that may be expressing a bug
in the SHA1 code.  (Or at least, that's all myself and Eric Biederman can
think of... but it was late and we were tired.)

The long and the short of it is a large number of 'offset' variables are 
'unsigned long', and needed to be changed to 'off_t', with a few cleanups
 to accommodate this change.

The primary goal was to get the extended index to work on x86 (as well as
x86_64) machines.  Operations such as 'git fetch/pull', 'git cat-file -p', and
of course 'git-fsck --full' were either failing or reporting errors.

In the process of testing, I found an issue on x86 (x86_64 worked fine) where
'git-fsck' would report problems "error:  packed <sha1> from <packfile> is
corrupt"  (On 3 files in a very large repository; the number of 'corrupt' files
is a small percentage)

The patch has some (currently disabled) code that would write the offending
blobs to disk, as well as print some related data to the screen.  In the test
repository, the 'corrupt' blobs were fairly large binaries, which ended up
being either tar.gz or tar.bz2.  (Of sizes 200-19 MB).  I checked the blobs for
internal consistency (gzip -t and bzip2 -t), and they checked out OK.
Similarly, git-cat-file -p could extract the 'corrupt' blob without any issues.

Being a bit confused at this point, I checked with Eric Biederman, who also
couldn't find why the blob was being labeled as 'corrupt'.  Eric's only guess
was that something funny was happening in the SHA1 generation, which I don't
believe my code even touches.

That being said, the only problem I've found at this point is with git-fsck,
and then only with 32-bit machines.  Other (previously broken with both 
the old and 'new' index formats) operations such as fetch/pull, and cat-file
now work on 32-bit systems.  This marks the first time I've ever been able
to use git with my large repository on 32-bit systems.
---
 builtin-pack-objects.c   |   30 ++++++++++----------
 builtin-unpack-objects.c |   12 ++++----
 cache.h                  |    6 ++--
 index-pack.c             |   20 +++++++------
 pack-check.c             |   35 ++++++++++++++++++++----
 sha1_file.c              |   67 ++++++++++++++++++++++++++-------------------
 6 files changed, 103 insertions(+), 67 deletions(-)

diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 92087c1..c1b41b5 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -23,7 +23,7 @@ git-pack-objects [{ -q | --progress | --all-progress }] \n\
 struct object_entry {
 	unsigned char sha1[20];
 	unsigned long size;	/* uncompressed size */
-	unsigned long offset;	/* offset into the final pack file;
+	off_t offset;		/* offset into the final pack file;
 				 * nonzero if already written.
 				 */
 	unsigned int depth;	/* delta depth */
@@ -222,7 +222,7 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
 }
 
 static struct revindex_entry * find_packed_object(struct packed_git *p,
-						  unsigned int ofs)
+						  off_t ofs)
 {
 	int num;
 	int lo, hi;
@@ -250,15 +250,15 @@ static struct revindex_entry * find_packed_object(struct packed_git *p,
 	die("internal error: pack revindex corrupt");
 }
 
-static unsigned long find_packed_object_size(struct packed_git *p,
-					     unsigned long ofs)
+static off_t find_packed_object_size(struct packed_git *p,
+					     off_t ofs)
 {
 	struct revindex_entry *entry = find_packed_object(p, ofs);
 	return entry[1].offset - ofs;
 }
 
 static unsigned char *find_packed_object_name(struct packed_git *p,
-					      unsigned long ofs)
+					      off_t ofs)
 {
 	struct revindex_entry *entry = find_packed_object(p, ofs);
 	if (p->index.version == 0)
@@ -320,7 +320,7 @@ static int encode_header(enum object_type type, unsigned long size, unsigned cha
  */
 static int check_pack_inflate(struct packed_git *p,
 		struct pack_window **w_curs,
-		unsigned long offset,
+		off_t offset,
 		unsigned long len,
 		unsigned long expect)
 {
@@ -347,7 +347,7 @@ static int check_pack_inflate(struct packed_git *p,
 static void copy_pack_data(struct sha1file *f,
 		struct packed_git *p,
 		struct pack_window **w_curs,
-		unsigned long offset,
+		off_t offset,
 		unsigned long len)
 {
 	unsigned char *in;
@@ -483,7 +483,7 @@ static unsigned long write_object(struct sha1file *f,
 			 * encoding of the relative offset for the delta
 			 * base from this object's position in the pack.
 			 */
-			unsigned long ofs = entry->offset - entry->delta->offset;
+			off_t ofs = entry->offset - entry->delta->offset;
 			unsigned pos = sizeof(header) - 1;
 			header[pos] = ofs & 127;
 			while (ofs >>= 7)
@@ -504,7 +504,7 @@ static unsigned long write_object(struct sha1file *f,
 	else {
 		struct packed_git *p = entry->in_pack;
 		struct pack_window *w_curs = NULL;
-		unsigned long offset;
+		off_t offset;
 
 		if (entry->delta) {
 			obj_type = (allow_ofs_delta && entry->delta->offset) ?
@@ -514,7 +514,7 @@ static unsigned long write_object(struct sha1file *f,
 		hdrlen = encode_header(obj_type, entry->size, header);
 		sha1write(f, header, hdrlen);
 		if (obj_type == OBJ_OFS_DELTA) {
-			unsigned long ofs = entry->offset - entry->delta->offset;
+			off_t ofs = entry->offset - entry->delta->offset;
 			unsigned pos = sizeof(header) - 1;
 			header[pos] = ofs & 127;
 			while (ofs >>= 7)
@@ -544,7 +544,7 @@ static unsigned long write_object(struct sha1file *f,
 
 static unsigned long write_one(struct sha1file *f,
 			       struct object_entry *e,
-			       unsigned long offset)
+			       off_t offset)
 {
 	if (e->offset || e->preferred_base)
 		/* offset starts from header size and cannot be zero
@@ -562,7 +562,7 @@ static void write_pack_file(void)
 {
 	int i;
 	struct sha1file *f;
-	unsigned long offset;
+	off_t offset;
 	struct pack_header hdr;
 	unsigned last_percent = 999;
 	int do_progress = progress;
@@ -721,7 +721,7 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud
 
 	if (!exclude) {
 		for (p = packed_git; p; p = p->next) {
-			unsigned long offset = find_pack_entry_one(sha1, p);
+			off_t offset = find_pack_entry_one(sha1, p);
 			if (offset) {
 				if (incremental)
 					return 0;
@@ -1044,7 +1044,7 @@ static void check_object(struct object_entry *entry)
 		 */
 		if (!no_reuse_delta) {
 			unsigned char c, *base_name;
-			unsigned long ofs;
+			off_t ofs;
 			unsigned long used_0;
 			/* there is at least 20 bytes left in the pack */
 			switch (entry->in_pack_type) {
@@ -1061,7 +1061,7 @@ static void check_object(struct object_entry *entry)
 				ofs = c & 127;
 				while (c & 128) {
 					ofs += 1;
-					if (!ofs || ofs & ~(~0UL >> 7))
+					if (!ofs || ofs & ~(~((off_t) 0) >> 7))
 						die("delta base offset overflow in pack for %s",
 						    sha1_to_hex(entry->sha1));
 					c = buf[used_0++];
diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c
index 8f8e898..b85c557 100644
--- a/builtin-unpack-objects.c
+++ b/builtin-unpack-objects.c
@@ -13,7 +13,7 @@ static const char unpack_usage[] = "git-unpack-objects [-n] [-q] [-r] < pack-fil
 
 /* We always read in 4kB chunks. */
 static unsigned char buffer[4096];
-static unsigned long offset, len, consumed_bytes;
+static off_t offset, len, consumed_bytes;
 static SHA_CTX ctx;
 
 /*
@@ -88,7 +88,7 @@ static void *get_data(unsigned long size)
 
 struct delta_info {
 	unsigned char base_sha1[20];
-	unsigned long base_offset;
+	off_t base_offset;
 	unsigned long size;
 	void *delta;
 	unsigned nr;
@@ -98,7 +98,7 @@ struct delta_info {
 static struct delta_info *delta_list;
 
 static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1,
-			      unsigned long base_offset,
+			      off_t base_offset,
 			      void *delta, unsigned long size)
 {
 	struct delta_info *info = xmalloc(sizeof(*info));
@@ -113,7 +113,7 @@ static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1,
 }
 
 struct obj_info {
-	unsigned long offset;
+	off_t offset;
 	unsigned char sha1[20];
 };
 
@@ -209,7 +209,7 @@ static void unpack_delta_entry(enum object_type kind, unsigned long delta_size,
 	} else {
 		unsigned base_found = 0;
 		unsigned char *pack, c;
-		unsigned long base_offset;
+		off_t base_offset;
 		unsigned lo, mid, hi;
 
 		pack = fill(1);
@@ -218,7 +218,7 @@ static void unpack_delta_entry(enum object_type kind, unsigned long delta_size,
 		base_offset = c & 127;
 		while (c & 128) {
 			base_offset += 1;
-			if (!base_offset || base_offset & ~(~0UL >> 7))
+			if (!base_offset || base_offset & ~(~((off_t) 0) >> 7))
 				die("offset value overflow for delta base object");
 			pack = fill(1);
 			c = *pack;
diff --git a/cache.h b/cache.h
index b329ca4..88565b0 100644
--- a/cache.h
+++ b/cache.h
@@ -420,15 +420,15 @@ extern struct packed_git *find_sha1_pack(const unsigned char *sha1,
 					 struct packed_git *packs);
 
 extern void pack_report(void);
-extern unsigned char* use_pack(struct packed_git *, struct pack_window **, unsigned long, unsigned int *);
+extern unsigned char* use_pack(struct packed_git *, struct pack_window **, off_t, unsigned int *);
 extern void unuse_pack(struct pack_window **);
 extern struct packed_git *add_packed_git(char *, int, int);
 extern int num_packed_objects(const struct packed_git *p);
 extern int nth_packed_object_sha1(const struct packed_git *, int, unsigned char*);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
-extern void *unpack_entry(struct packed_git *, unsigned long, char *, unsigned long *);
+extern void *unpack_entry(struct packed_git *, off_t, char *, unsigned long *);
 extern unsigned long unpack_object_header_gently(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
-extern void packed_object_info_detail(struct packed_git *, unsigned long, char *, unsigned long *, unsigned long *, unsigned int *, unsigned char *);
+extern void packed_object_info_detail(struct packed_git *, off_t, char *, unsigned long *, unsigned long *, unsigned int *, unsigned char *);
 
 /* Dumb servers support */
 extern int update_server_info(int);
diff --git a/index-pack.c b/index-pack.c
index cc22ea2..ef10e38 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -22,7 +22,7 @@ struct object_entry
 
 union delta_base {
 	unsigned char sha1[20];
-	unsigned long offset;
+	off_t offset;
 };
 
 /*
@@ -173,10 +173,10 @@ static void parse_pack_header(void)
 	use(sizeof(struct pack_header));
 }
 
-static void bad_object(unsigned long offset, const char *format,
+static void bad_object(off_t offset, const char *format,
 		       ...) NORETURN __attribute__((format (printf, 2, 3)));
 
-static void bad_object(unsigned long offset, const char *format, ...)
+static void bad_object(off_t offset, const char *format, ...)
 {
 	va_list params;
 	char buf[1024];
@@ -184,10 +184,11 @@ static void bad_object(unsigned long offset, const char *format, ...)
 	va_start(params, format);
 	vsnprintf(buf, sizeof(buf), format, params);
 	va_end(params);
-	die("pack has bad object at offset %lu: %s", offset, buf);
+	die("pack has bad object at offset %llu: %s",
+		(unsigned long long) offset, buf);
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size)
+static void *unpack_entry_data(off_t offset, unsigned long size)
 {
 	z_stream stream;
 	void *buf = xmalloc(size);
@@ -216,7 +217,8 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size)
 static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
 {
 	unsigned char *p, c;
-	unsigned long size, base_offset;
+	unsigned long size;
+	off_t base_offset;
 	unsigned shift;
 
 	obj->offset = consumed_bytes;
@@ -249,7 +251,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 		base_offset = c & 127;
 		while (c & 128) {
 			base_offset += 1;
-			if (!base_offset || base_offset & ~(~0UL >> 7))
+			if (!base_offset || base_offset & ~(~((off_t)0) >> 7))
 				bad_object(obj->offset, "offset value overflow for delta base object");
 			p = fill(1);
 			c = *p;
@@ -275,8 +277,8 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 
 static void *get_data_from_pack(struct object_entry *obj)
 {
-	unsigned long from = obj[0].offset + obj[0].hdr_size;
-	unsigned long len = obj[1].offset - from;
+	off_t from = obj[0].offset + obj[0].hdr_size;
+	off_t len = obj[1].offset - from;
 	unsigned char *src, *data;
 	z_stream stream;
 	int st;
diff --git a/pack-check.c b/pack-check.c
index 12ee933..af5fd3b 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -8,7 +8,7 @@ static int verify_packfile(struct packed_git *p,
 	void *index_base = p->index.base;
 	SHA_CTX ctx;
 	unsigned char sha1[20];
-	unsigned long offset = 0, pack_sig = p->pack_size - 20;
+	off_t offset = 0, pack_sig = p->pack_size - 20;
 	int nr_objects, err, i;
 
 	/* Note that the pack header checks are actually performed by
@@ -22,9 +22,11 @@ static int verify_packfile(struct packed_git *p,
 		unsigned int remaining;
 		unsigned char *in = use_pack(p, w_curs, offset, &remaining);
 		offset += remaining;
+/*		printf("Offset: %llu\tpack_sig: %llu\tremaining: %u\tfirst foo\n",(unsigned long long) offset, (unsigned long long) pack_sig, remaining);*/
 		if (offset > pack_sig)
 			remaining -= offset - pack_sig;
 		SHA1_Update(&ctx, in, remaining);
+/*		printf("Offset: %llu\tpack_sig: %llu\tremaining: %u\n",(unsigned long long) offset, (unsigned long long) pack_sig, remaining);*/
 	}
 	SHA1_Final(sha1, &ctx);
 	if (hashcmp(sha1, use_pack(p, w_curs, pack_sig, NULL)))
@@ -33,6 +35,7 @@ static int verify_packfile(struct packed_git *p,
 	if (hashcmp(sha1, (unsigned char *)index_base + index_size - 40))
 		return error("Packfile %s SHA1 mismatch with idx",
 			     p->pack_name);
+/*	printf("Success; pack name: %s\t%s\n", p->pack_name, sha1_to_hex(sha1));*/
 	unuse_pack(w_curs);
 
 	/* Make sure everything reachable from idx is valid.  Since we
@@ -44,7 +47,8 @@ static int verify_packfile(struct packed_git *p,
 		unsigned char sha1[20];
 		void *data;
 		char type[20];
-		unsigned long size, offset;
+		unsigned long size;
+		off_t offset;
 
 		if (nth_packed_object_sha1(p, i, sha1))
 			die("internal error pack-check nth-packed-object");
@@ -58,6 +62,23 @@ static int verify_packfile(struct packed_git *p,
 			continue;
 		}
 		if (check_sha1_signature(sha1, data, size, type)) {
+#if 0
+			int fd;
+			fd = open(sha1_to_hex(sha1), O_RDWR | O_CREAT,0777);
+			if (fd >= 0) {
+				ssize_t written = 0, chunk;
+				while (written != size) {
+					chunk = xwrite(fd, data + written, size - written);
+					if (chunk < 0)
+						die("OUch!");
+					written += chunk;
+				}
+				close(fd);
+				printf("wrote the data\n");
+			}
+#endif
+			printf("Size=%lu\n",size);
+
 			err = error("packed %s from %s is corrupt",
 				    sha1_to_hex(sha1), p->pack_name);
 			free(data);
@@ -85,7 +106,7 @@ static void show_pack_info(struct packed_git *p)
 		char type[20];
 		unsigned long size;
 		unsigned long store_size;
-		unsigned long offset;
+		off_t offset;
 		unsigned int delta_chain_length;
 
 		if (nth_packed_object_sha1(p, i, sha1))
@@ -99,10 +120,12 @@ static void show_pack_info(struct packed_git *p)
 					  base_sha1);
 		printf("%s ", sha1_to_hex(sha1));
 		if (!delta_chain_length)
-			printf("%-6s %lu %lu\n", type, size, offset);
+			printf("%-6s %lu %llu\n", type, size,
+				(unsigned long long) offset);
 		else {
-			printf("%-6s %lu %lu %u %s\n", type, size, offset,
-			       delta_chain_length, sha1_to_hex(base_sha1));
+			printf("%-6s %lu %llu %u %s\n", type, size,
+				(unsigned long long) offset,
+				delta_chain_length, sha1_to_hex(base_sha1));
 			if (delta_chain_length < MAX_CHAIN)
 				chain_histogram[delta_chain_length]++;
 			else
diff --git a/sha1_file.c b/sha1_file.c
index c5ab5cc..23c5cb2 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -646,7 +646,7 @@ static int open_packed_git(struct packed_git *p)
 	return -1;
 }
 
-static int in_window(struct pack_window *win, unsigned long offset)
+static int in_window(struct pack_window *win, off_t offset)
 {
 	/* We must promise at least 20 bytes (one hash) after the
 	 * offset is available from this window, otherwise the offset
@@ -661,7 +661,7 @@ static int in_window(struct pack_window *win, unsigned long offset)
 
 unsigned char* use_pack(struct packed_git *p,
 		struct pack_window **w_cursor,
-		unsigned long offset,
+		off_t offset,
 		unsigned int *left)
 {
 	struct pack_window *win = *w_cursor;
@@ -688,9 +688,9 @@ unsigned char* use_pack(struct packed_git *p,
 			size_t window_align = packed_git_window_size / 2;
 			win = xcalloc(1, sizeof(*win));
 			win->offset = (offset / window_align) * window_align;
-			win->len = p->pack_size - win->offset;
-			if (win->len > packed_git_window_size)
-				win->len = packed_git_window_size;
+			win->len = packed_git_window_size;
+			if (win->len > p->pack_size - win->offset)
+				win->len = p->pack_size - win->offset;
 			pack_mapped += win->len;
 			while (packed_git_limit < pack_mapped
 				&& unuse_one_window(p))
@@ -859,7 +859,15 @@ int check_sha1_signature(const unsigned char *sha1, void *map, unsigned long siz
 {
 	unsigned char real_sha1[20];
 	hash_sha1_file(map, size, type, real_sha1);
+#if 0
+	if (hashcmp(sha1, real_sha1) == 0)
+		return 0;
+	printf("%s: src:%s ", __func__, sha1_to_hex(sha1));
+	printf("real: %s type: %s\n", sha1_to_hex(real_sha1), type);
+	return -1;
+#else
 	return hashcmp(sha1, real_sha1) ? -1 : 0;
+#endif
 }
 
 void *map_sha1_file(const unsigned char *sha1, unsigned long *size)
@@ -1061,15 +1069,15 @@ void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned l
 	return unpack_sha1_rest(&stream, hdr, *size);
 }
 
-static unsigned long get_delta_base(struct packed_git *p,
+static off_t get_delta_base(struct packed_git *p,
 				    struct pack_window **w_curs,
-				    unsigned long offset,
+				    off_t offset,
 				    enum object_type kind,
-				    unsigned long delta_obj_offset,
-				    unsigned long *base_obj_offset)
+				    off_t delta_obj_offset,
+				    off_t *base_obj_offset)
 {
 	unsigned char *base_info = use_pack(p, w_curs, offset, NULL);
-	unsigned long base_offset;
+	off_t base_offset;
 
 	/* use_pack() assured us we have [base_info, base_info + 20)
 	 * as a range that we can look at without walking off the
@@ -1083,7 +1091,7 @@ static unsigned long get_delta_base(struct packed_git *p,
 		base_offset = c & 127;
 		while (c & 128) {
 			base_offset += 1;
-			if (!base_offset || base_offset & ~(~0UL >> 7))
+			if (!base_offset || base_offset & ~(~((off_t) 0) >> 7))
 				die("offset value overflow for delta base object");
 			c = base_info[used++];
 			base_offset = (base_offset << 7) + (c & 127);
@@ -1106,18 +1114,18 @@ static unsigned long get_delta_base(struct packed_git *p,
 }
 
 /* forward declaration for a mutually recursive function */
-static int packed_object_info(struct packed_git *p, unsigned long offset,
+static int packed_object_info(struct packed_git *p, off_t offset,
 			      char *type, unsigned long *sizep);
 
 static int packed_delta_info(struct packed_git *p,
 			     struct pack_window **w_curs,
-			     unsigned long offset,
+			     off_t offset,
 			     enum object_type kind,
-			     unsigned long obj_offset,
+			     off_t obj_offset,
 			     char *type,
 			     unsigned long *sizep)
 {
-	unsigned long base_offset;
+	off_t base_offset;
 
 	offset = get_delta_base(p, w_curs, offset, kind,
 		obj_offset, &base_offset);
@@ -1169,9 +1177,9 @@ static int packed_delta_info(struct packed_git *p,
 	return 0;
 }
 
-static unsigned long unpack_object_header(struct packed_git *p,
+static off_t unpack_object_header(struct packed_git *p,
 		struct pack_window **w_curs,
-		unsigned long offset,
+		off_t offset,
 		enum object_type *type,
 		unsigned long *sizep)
 {
@@ -1194,7 +1202,7 @@ static unsigned long unpack_object_header(struct packed_git *p,
 }
 
 void packed_object_info_detail(struct packed_git *p,
-			       unsigned long offset,
+			       off_t offset,
 			       char *type,
 			       unsigned long *size,
 			       unsigned long *store_size,
@@ -1202,7 +1210,8 @@ void packed_object_info_detail(struct packed_git *p,
 			       unsigned char *base_sha1)
 {
 	struct pack_window *w_curs = NULL;
-	unsigned long obj_offset, val;
+	off_t obj_offset;
+	unsigned long val;
 	unsigned char *next_sha1;
 	enum object_type kind;
 
@@ -1243,7 +1252,7 @@ void packed_object_info_detail(struct packed_git *p,
 	}
 }
 
-static int packed_object_info(struct packed_git *p, unsigned long offset,
+static int packed_object_info(struct packed_git *p, off_t offset,
 			      char *type, unsigned long *sizep)
 {
 	struct pack_window *w_curs = NULL;
@@ -1278,7 +1287,7 @@ static int packed_object_info(struct packed_git *p, unsigned long offset,
 
 static void *unpack_compressed_entry(struct packed_git *p,
 				    struct pack_window **w_curs,
-				    unsigned long offset,
+				    off_t offset,
 				    unsigned long size)
 {
 	int st;
@@ -1309,22 +1318,23 @@ static void *unpack_compressed_entry(struct packed_git *p,
 
 static void *unpack_delta_entry(struct packed_git *p,
 				struct pack_window **w_curs,
-				unsigned long offset,
+				off_t offset,
 				unsigned long delta_size,
 				enum object_type kind,
-				unsigned long obj_offset,
+				off_t obj_offset,
 				char *type,
 				unsigned long *sizep)
 {
 	void *delta_data, *result, *base;
-	unsigned long result_size, base_size, base_offset;
+	unsigned long result_size, base_size;
+	off_t base_offset;
 
 	offset = get_delta_base(p, w_curs, offset, kind,
 		obj_offset, &base_offset);
 	base = unpack_entry(p, base_offset, type, &base_size);
 	if (!base)
-		die("failed to read delta base object at %lu from %s",
-		    base_offset, p->pack_name);
+		die("failed to read delta base object at %llu from %s",
+		    (unsigned long long) base_offset, p->pack_name);
 
 	delta_data = unpack_compressed_entry(p, w_curs, offset, delta_size);
 	result = patch_delta(base, base_size,
@@ -1342,7 +1352,8 @@ void *unpack_entry(struct packed_git *p, off_t offset,
 			  char *type, unsigned long *sizep)
 {
 	struct pack_window *w_curs = NULL;
-	unsigned long size, obj_offset = offset;
+	unsigned long size;
+	off_t obj_offset = offset;
 	enum object_type kind;
 	void *retval;
 
@@ -1495,7 +1506,7 @@ static int matches_pack_name(struct packed_git *p, const char *ig)
 static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e, const char **ignore_packed)
 {
 	struct packed_git *p;
-	unsigned long offset;
+	off_t offset;
 
 	prepare_packed_git();
 
-- 
1.5.0.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2007-02-28 19:47 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-26 22:40 [PATCH] Support 64-bit indexes for pack files Troy Telford
2007-02-26 23:55 ` Shawn O. Pearce
2007-02-27  0:24   ` Nicolas Pitre
2007-02-27  0:31     ` Shawn O. Pearce
2007-02-27  4:32       ` Nicolas Pitre
2007-02-27  4:55         ` Geert Bosch
2007-02-27  5:11           ` Nicolas Pitre
2007-02-27 16:04             ` Geert Bosch
2007-02-27 16:11               ` Shawn O. Pearce
2007-02-27 16:55                 ` Geert Bosch
2007-02-27 17:36                   ` Nicolas Pitre
2007-02-28  3:52                     ` Shawn O. Pearce
2007-02-28  4:12                       ` Nicolas Pitre
2007-02-27 17:03               ` Nicolas Pitre
2007-02-27 20:05               ` Johannes Schindelin
2007-02-27 20:25                 ` Geert Bosch
2007-02-27 20:35                   ` Johannes Schindelin
2007-02-27  1:16   ` Troy Telford
2007-02-27  4:56   ` Nicolas Pitre
2007-02-28 19:46 ` Troy Telford

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.