* [PATCH] Support 64-bit indexes for pack files.
@ 2007-02-26 22:40 Troy Telford
2007-02-26 23:55 ` Shawn O. Pearce
2007-02-28 19:46 ` Troy Telford
0 siblings, 2 replies; 20+ messages in thread
From: Troy Telford @ 2007-02-26 22:40 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: git
As I've not been involved with git development before, I'm
aware that this may already be on somebody's 'todo' list. It was an itch
I needed to scratch, as I have a repository whose size is multiple gigabytes,
and 'git clone' (by default) forces everything into a single
packfile with >=git-1.5.0.
Since I'm an insane form of lazy, needed the C practice, and wanted to
avoid those few keystrokes needed to tweak the default push/fetch behavior,
I'm attempting to either fix git or break it in a new and interesting way.
The patch introduces a new packfile index version, which adds a:
* header to the index file (index version info).
* leaves the object indexes within the index at 32-bit
* extends the offsets used to describe the packfile to 64-bit.
The new index format is only used when the associated packfile
becomes large enough to warrant a 64-bit index; otherwise the original
index format is used.
Operations such as git-fsck, checkouts, diffs, and branches
all appear to work properly on x86-64 architectures. I've done
testing on both x86 and x86_64 architectures, using a git repository
with a single 5.4 GB packfile.
32-bit architectures still have issues using the 64-bit
indexes (git- fsck, pull/fetch return various errors) so it's not yet
complete.
I did have quite a bit of help from Eric Biederman, whom
I'd like to acknowledge and thank for his help.
Signed-off-by: Troy Telford <ttelford.groups@gmail.com>
---
builtin-pack-objects.c | 50 ++++++++++++-
cache.h | 13 ++-
index-pack.c | 30 ++++++-
pack-check.c | 8 +-
pack-redundant.c | 83 +++++++++++++++------
pack.h | 4 +
sha1_file.c | 193 +++++++++++++++++++++++++++++++++++-------------
show-index.c | 81 +++++++++++++++-----
8 files changed, 349 insertions(+), 113 deletions(-)
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index b5ed9ce..92087c1 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -101,7 +101,7 @@ static int object_ix_hashsz;
* get the object sha1 from the main index.
*/
struct revindex_entry {
- unsigned int offset;
+ off_t offset;
unsigned int nr;
};
struct pack_revindex {
@@ -163,12 +163,12 @@ static int cmp_offset(const void *a_, const void *b_)
/*
* Ordered list of offsets of objects in the pack.
*/
-static void prepare_pack_revindex(struct pack_revindex *rix)
+static void prepare_pack_revindex_v0(struct pack_revindex *rix)
{
struct packed_git *p = rix->p;
int num_ent = num_packed_objects(p);
int i;
- void *index = p->index_base + 256;
+ void *index = p->index.base + 256;
rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
for (i = 0; i < num_ent; i++) {
@@ -184,6 +184,43 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
}
+static void prepare_pack_revindex_v1(struct pack_revindex *rix)
+{
+ struct packed_git *p = rix->p;
+ int num_ent = num_packed_objects(p);
+ int i;
+ uint32_t *index = p->index.base + 256 + 2;
+
+ rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
+ for (i = 0; i < num_ent; i++) {
+ off_t offset;
+ offset = ntohl(*(index + (7 * i)));
+ offset <<=32;
+ offset |= ntohl(*(index + ((7 * i) + 1)));
+ rix->revindex[i].offset = offset;
+ rix->revindex[i].nr = i;
+ }
+ /* This knows the pack format -- the 20-byte trailer
+ * follows immediately after the last object data.
+ */
+ rix->revindex[num_ent].offset = p->pack_size - 20;
+ rix->revindex[num_ent].nr = -1;
+ qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
+}
+static void prepare_pack_revindex(struct pack_revindex *rix)
+{
+ switch (rix->p->index.version)
+ {
+ case 0:
+ return prepare_pack_revindex_v0(rix);
+ case 1:
+ return prepare_pack_revindex_v1(rix);
+ default:
+ die("Unsupported Pack Version");
+
+ }
+}
+
static struct revindex_entry * find_packed_object(struct packed_git *p,
unsigned int ofs)
{
@@ -224,7 +261,12 @@ static unsigned char *find_packed_object_name(struct packed_git *p,
unsigned long ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
- return (unsigned char *)(p->index_base + 256) + 24 * entry->nr + 4;
+ if (p->index.version == 0)
+ return (unsigned char *)(p->index.base + 256) + 24 * entry->nr + 4;
+ else if (p->index.version == 1)
+ return (unsigned char *)(p->index.base + 256 + 2) + 28 * entry->nr + 8;
+ else
+ die("Unsupported index version");
}
static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
diff --git a/cache.h b/cache.h
index 04f8e63..b329ca4 100644
--- a/cache.h
+++ b/cache.h
@@ -362,12 +362,17 @@ struct pack_window {
unsigned int inuse_cnt;
};
+struct index_info {
+ off_t size;
+ uint32_t *base;
+ uint32_t version;
+};
+
extern struct packed_git {
struct packed_git *next;
struct pack_window *windows;
- uint32_t *index_base;
- off_t index_size;
off_t pack_size;
+ struct index_info index;
int pack_fd;
int pack_local;
unsigned char sha1[20];
@@ -376,7 +381,7 @@ extern struct packed_git {
} *packed_git;
struct pack_entry {
- unsigned int offset;
+ off_t offset;
unsigned char sha1[20];
struct packed_git *p;
};
@@ -420,7 +425,7 @@ extern void unuse_pack(struct pack_window **);
extern struct packed_git *add_packed_git(char *, int, int);
extern int num_packed_objects(const struct packed_git *p);
extern int nth_packed_object_sha1(const struct packed_git *, int, unsigned char*);
-extern unsigned long find_pack_entry_one(const unsigned char *, struct packed_git *);
+extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
extern void *unpack_entry(struct packed_git *, unsigned long, char *, unsigned long *);
extern unsigned long unpack_object_header_gently(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
extern void packed_object_info_detail(struct packed_git *, unsigned long, char *, unsigned long *, unsigned long *, unsigned int *, unsigned char *);
diff --git a/index-pack.c b/index-pack.c
index fa9a0e7..cc22ea2 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -12,7 +12,7 @@ static const char index_pack_usage[] =
struct object_entry
{
- unsigned long offset;
+ off_t offset;
unsigned long size;
unsigned int hdr_size;
enum object_type type;
@@ -83,7 +83,7 @@ static unsigned display_progress(unsigned n, unsigned total, unsigned last_pc)
/* We always read in 4kB chunks. */
static unsigned char input_buffer[4096];
-static unsigned long input_offset, input_len, consumed_bytes;
+static off_t input_offset, input_len, consumed_bytes;
static SHA_CTX input_ctx;
static int input_fd, output_fd, pack_fd;
@@ -707,6 +707,13 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
die("unable to create %s: %s", index_name, strerror(errno));
f = sha1fd(fd, index_name);
+ if ( consumed_bytes >= 0xffffffffUL )
+ {
+ struct pack_idx_header hdr;
+ hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
+ hdr.idx_version = htonl(1);
+ sha1write(f, &hdr, sizeof(hdr));
+ }
/*
* Write the first-level table (the list is sorted,
* but we use a 256-entry lookup to be able to avoid
@@ -736,9 +743,22 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1)
list = sorted_by_sha;
for (i = 0; i < nr_objects; i++) {
struct object_entry *obj = *list++;
- unsigned int offset = htonl(obj->offset);
- sha1write(f, &offset, 4);
- sha1write(f, obj->sha1, 20);
+ if ( consumed_bytes < 0xffffffffUL )
+ {
+ unsigned int offset = htonl(obj->offset);
+ sha1write(f, &offset, 4);
+ sha1write(f, obj->sha1, 20);
+ }
+ else
+ {
+ uint32_t low;
+ uint32_t high;
+ low=htonl(obj->offset & 0xffffffff);
+ high=htonl((obj->offset >>32) & 0xffffffff);
+ sha1write(f, &high, 4);
+ sha1write(f, &low, 4);
+ sha1write(f, obj->sha1, 20);
+ }
SHA1_Update(&ctx, obj->sha1, 20);
}
sha1write(f, sha1, 20);
diff --git a/pack-check.c b/pack-check.c
index 08a9fd8..12ee933 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -4,8 +4,8 @@
static int verify_packfile(struct packed_git *p,
struct pack_window **w_curs)
{
- unsigned long index_size = p->index_size;
- void *index_base = p->index_base;
+ off_t index_size = p->index.size;
+ void *index_base = p->index.base;
SHA_CTX ctx;
unsigned char sha1[20];
unsigned long offset = 0, pack_sig = p->pack_size - 20;
@@ -123,8 +123,8 @@ static void show_pack_info(struct packed_git *p)
int verify_pack(struct packed_git *p, int verbose)
{
- unsigned long index_size = p->index_size;
- void *index_base = p->index_base;
+ off_t index_size = p->index.size;
+ void *index_base = p->index.base;
SHA_CTX ctx;
unsigned char sha1[20];
int ret;
diff --git a/pack-redundant.c b/pack-redundant.c
index edb5524..d327ae4 100644
--- a/pack-redundant.c
+++ b/pack-redundant.c
@@ -246,15 +246,29 @@ static struct pack_list * pack_list_difference(const struct pack_list *A,
static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
{
int p1_off, p2_off;
+ int p1_inc, p2_inc;
unsigned char *p1_base, *p2_base;
struct llist_item *p1_hint = NULL, *p2_hint = NULL;
- p1_off = p2_off = 256 * 4 + 4;
- p1_base = (unsigned char *) p1->pack->index_base;
- p2_base = (unsigned char *) p2->pack->index_base;
+ if (p1->pack->index.version == 0) {
+ p1_off = 256 * 4 + 4;
+ p1_inc = 24;
+ } else {
+ p1_off = 256 * 4 + 8 + 8;
+ p1_inc = 28;
+ }
+ if (p2->pack->index.version == 0) {
+ p2_off = 256 * 4 + 4;
+ p2_inc = 24;
+ } else {
+ p2_off = 256 * 4 + 8 + 8;
+ p2_inc = 28;
+ }
+ p1_base = (unsigned char *) p1->pack->index.base;
+ p2_base = (unsigned char *) p2->pack->index.base;
- while (p1_off <= p1->pack->index_size - 3 * 20 &&
- p2_off <= p2->pack->index_size - 3 * 20)
+ while (p1_off <= p1->pack->index.size - 3 * 20 &&
+ p2_off <= p2->pack->index.size - 3 * 20)
{
int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
/* cmp ~ p1 - p2 */
@@ -263,14 +277,14 @@ static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
p1_base + p1_off, p1_hint);
p2_hint = llist_sorted_remove(p2->unique_objects,
p1_base + p1_off, p2_hint);
- p1_off+=24;
- p2_off+=24;
+ p1_off+=p1_inc;
+ p2_off+=p2_inc;
continue;
}
if (cmp < 0) { /* p1 has the object, p2 doesn't */
- p1_off+=24;
+ p1_off+=p1_inc;
} else { /* p2 has the object, p1 doesn't */
- p2_off+=24;
+ p2_off+=p2_inc;
}
}
}
@@ -351,27 +365,41 @@ static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
{
size_t ret = 0;
int p1_off, p2_off;
+ int p1_inc, p2_inc;
unsigned char *p1_base, *p2_base;
- p1_off = p2_off = 256 * 4 + 4;
- p1_base = (unsigned char *)p1->index_base;
- p2_base = (unsigned char *)p2->index_base;
+ if (p1->index.version == 0) {
+ p1_off = 256 * 4 + 4;
+ p1_inc = 24;
+ } else {
+ p1_off = 256 * 4 + 8 + 8;
+ p1_inc = 28;
+ }
+ if (p2->index.version == 0) {
+ p2_off = 256 * 4 + 4;
+ p2_inc = 24;
+ } else {
+ p2_off = 256 * 4 + 8 + 8;
+ p2_inc = 28;
+ }
+ p1_base = (unsigned char *)p1->index.base;
+ p2_base = (unsigned char *)p2->index.base;
- while (p1_off <= p1->index_size - 3 * 20 &&
- p2_off <= p2->index_size - 3 * 20)
+ while (p1_off <= p1->index.size - 3 * 20 &&
+ p2_off <= p2->index.size - 3 * 20)
{
int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
/* cmp ~ p1 - p2 */
if (cmp == 0) {
ret++;
- p1_off+=24;
- p2_off+=24;
+ p1_off+=p1_inc;
+ p2_off+=p2_inc;
continue;
}
if (cmp < 0) { /* p1 has the object, p2 doesn't */
- p1_off+=24;
+ p1_off+=p1_inc;
} else { /* p2 has the object, p1 doesn't */
- p2_off+=24;
+ p2_off+=p2_inc;
}
}
return ret;
@@ -401,7 +429,7 @@ static inline size_t pack_set_bytecount(struct pack_list *pl)
size_t ret = 0;
while (pl) {
ret += pl->pack->pack_size;
- ret += pl->pack->index_size;
+ ret += pl->pack->index.size;
pl = pl->next;
}
return ret;
@@ -534,6 +562,7 @@ static struct pack_list * add_pack(struct packed_git *p)
{
struct pack_list l;
size_t off;
+ int inc;
unsigned char *base;
if (!p->pack_local && !(alt_odb || verbose))
@@ -541,12 +570,18 @@ static struct pack_list * add_pack(struct packed_git *p)
l.pack = p;
llist_init(&l.all_objects);
-
- off = 256 * 4 + 4;
- base = (unsigned char *)p->index_base;
- while (off <= p->index_size - 3 * 20) {
+ if (p->index.version == 0) {
+ off = 256 * 4 + 4;
+ inc = 24;
+ }
+ if (p->index.version == 1) {
+ off = 256 * 4 + 8 + 8;
+ inc = 28;
+ }
+ base = (unsigned char *)p->index.base;
+ while (off <= p->index.size - 3 * 20) {
llist_insert_back(l.all_objects, base + off);
- off += 24;
+ off += inc;
}
/* this list will be pruned in cmp_two_packs later */
l.unique_objects = llist_copy(l.all_objects);
diff --git a/pack.h b/pack.h
index deb427e..fbf7992 100644
--- a/pack.h
+++ b/pack.h
@@ -41,6 +41,10 @@ struct pack_header {
* byte word. This would be true in the proposed future index
* format as idx_signature would be greater than idx_version.
*/
+struct pack_idx_header {
+ uint32_t idx_signature;
+ uint32_t idx_version;
+};
#define PACK_IDX_SIGNATURE 0xff744f63 /* "\377tOc" */
extern int verify_pack(struct packed_git *, int);
diff --git a/sha1_file.c b/sha1_file.c
index 2c87031..c5ab5cc 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -430,12 +430,9 @@ void pack_report()
pack_mapped, peak_pack_mapped);
}
-static int check_packed_git_idx(const char *path, unsigned long *idx_size_,
- void **idx_map_)
+static int check_packed_git_idx(const char *path, struct index_info * idx )
{
- void *idx_map;
uint32_t *index;
- unsigned long idx_size;
int nr, i;
int fd = open(path, O_RDONLY);
struct stat st;
@@ -445,46 +442,73 @@ static int check_packed_git_idx(const char *path, unsigned long *idx_size_,
close(fd);
return -1;
}
- idx_size = st.st_size;
- idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ idx->size = st.st_size;
+ idx->base = xmmap(NULL, idx->size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
- index = idx_map;
- *idx_map_ = idx_map;
- *idx_size_ = idx_size;
+ index = idx->base;
- /* check index map */
- if (idx_size < 4*256 + 20 + 20)
+ if (idx->size < 8)
return error("index file %s is too small", path);
-
+ if (index[0] != htonl(PACK_IDX_SIGNATURE))
+ {
+ /* check index map */
+ if (idx->size < 4*256 + 20 + 20)
+ return error("index file %s is too small", path);
+ nr = 0;
+ for (i = 0; i < 256; i++) {
+ unsigned int n = ntohl(index[i]);
+ if (n < nr)
+ return error("non-monotonic index %s", path);
+ nr = n;
+ }
+ /*
+ * Total size:
+ * - 256 index entries 4 bytes each
+ * - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
+ * (on 32-bit indexes)
+ * - 20-byte SHA1 of the packfile
+ * - 20-byte SHA1 file checksum
+ */
+ if (idx->size != 4*256 + nr * 24 + 20 + 20)
+ return error("wrong index file size in %s", path);
+ idx->version = 0;
+ }
+ else if (index[0] == htonl(PACK_IDX_SIGNATURE) && ntohl(index[1]) == 1 )
+ {
+ index += 2;
+ /* check index map */
+ if (idx->size < 8 + 4*256 + 20 + 20 )
+ return error("index file %s is too small", path);
+ nr = 0;
+ for (i = 0; i < 256; i++) {
+ unsigned int n = ntohl(index[i]);
+ if (n < nr)
+ return error("non-monotonic index %s", path);
+ nr = n;
+ }
+ /*
+ * Total size:
+ * - (4+4) byte header
+ * - 256 index entries 4 bytes each
+ * - 28-byte entries * nr (20-byte sha1 + 8 byte offset)
+ * (on 64-bit indexes)
+ * - 20-byte SHA1 of the packfile
+ * - 20-byte SHA1 file checksum
+ */
+ if (idx->size != 8 + 4*256 + nr * 28 + 20 + 20)
+ return error("wrong index file size in %s", path);
+ idx->version = 1;
+ }
/* a future index format would start with this, as older git
* binaries would fail the non-monotonic index check below.
* give a nicer warning to the user if we can.
*/
- if (index[0] == htonl(PACK_IDX_SIGNATURE))
+ else if (index[0] == htonl(PACK_IDX_SIGNATURE))
return error("index file %s is a newer version"
" and is not supported by this binary"
" (try upgrading GIT to a newer version)",
path);
-
- nr = 0;
- for (i = 0; i < 256; i++) {
- unsigned int n = ntohl(index[i]);
- if (n < nr)
- return error("non-monotonic index %s", path);
- nr = n;
- }
-
- /*
- * Total size:
- * - 256 index entries 4 bytes each
- * - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
- * - 20-byte SHA1 of the packfile
- * - 20-byte SHA1 file checksum
- */
- if (idx_size != 4*256 + nr * 24 + 20 + 20)
- return error("wrong index file size in %s", path);
-
return 0;
}
@@ -605,7 +629,7 @@ static int open_packed_git_1(struct packed_git *p)
return error("end of packfile %s is unavailable", p->pack_name);
if (read_in_full(p->pack_fd, sha1, sizeof(sha1)) != sizeof(sha1))
return error("packfile %s signature is unavailable", p->pack_name);
- idx_sha1 = ((unsigned char *)p->index_base) + p->index_size - 40;
+ idx_sha1 = ((unsigned char *)p->index.base) + p->index.size - 40;
if (hashcmp(sha1, idx_sha1))
return error("packfile %s does not match index", p->pack_name);
return 0;
@@ -703,17 +727,16 @@ struct packed_git *add_packed_git(char *path, int path_len, int local)
{
struct stat st;
struct packed_git *p;
- unsigned long idx_size;
- void *idx_map;
+ struct index_info idx;
unsigned char sha1[20];
- if (check_packed_git_idx(path, &idx_size, &idx_map))
+ if (check_packed_git_idx(path, &idx))
return NULL;
/* do we have a corresponding .pack file? */
strcpy(path + path_len - 4, ".pack");
if (stat(path, &st) || !S_ISREG(st.st_mode)) {
- munmap(idx_map, idx_size);
+ munmap(idx.base, idx.size);
return NULL;
}
/* ok, it looks sane as far as we can check without
@@ -721,9 +744,8 @@ struct packed_git *add_packed_git(char *path, int path_len, int local)
*/
p = xmalloc(sizeof(*p) + path_len + 2);
strcpy(p->pack_name, path);
- p->index_size = idx_size;
+ p->index = idx;
p->pack_size = st.st_size;
- p->index_base = idx_map;
p->next = NULL;
p->windows = NULL;
p->pack_fd = -1;
@@ -742,20 +764,18 @@ struct packed_git *parse_pack_index(unsigned char *sha1)
struct packed_git *parse_pack_index_file(const unsigned char *sha1, char *idx_path)
{
struct packed_git *p;
- unsigned long idx_size;
- void *idx_map;
+ struct index_info idx;
char *path;
- if (check_packed_git_idx(idx_path, &idx_size, &idx_map))
+ if (check_packed_git_idx(idx_path, &idx))
return NULL;
path = sha1_pack_name(sha1);
p = xmalloc(sizeof(*p) + strlen(path) + 2);
strcpy(p->pack_name, path);
- p->index_size = idx_size;
+ p->index = idx;
p->pack_size = 0;
- p->index_base = idx_map;
p->next = NULL;
p->windows = NULL;
p->pack_fd = -1;
@@ -1318,7 +1338,7 @@ static void *unpack_delta_entry(struct packed_git *p,
return result;
}
-void *unpack_entry(struct packed_git *p, unsigned long offset,
+void *unpack_entry(struct packed_git *p, off_t offset,
char *type, unsigned long *sizep)
{
struct pack_window *w_curs = NULL;
@@ -1350,27 +1370,56 @@ void *unpack_entry(struct packed_git *p, unsigned long offset,
int num_packed_objects(const struct packed_git *p)
{
+ int objects = 0;
/* See check_packed_git_idx() */
- return (p->index_size - 20 - 20 - 4*256) / 24;
+ if (p->index.version == 0)
+ objects = (p->index.size - 20 - 20 - 4*256) / 24;
+ else if (p->index.version == 1)
+ objects = (p->index.size - 8 - 20 - 20 - 4*256) / 28;
+ return objects;
}
-int nth_packed_object_sha1(const struct packed_git *p, int n,
- unsigned char* sha1)
+static int nth_packed_object_sha1_v0(const struct packed_git *p, int n,
+ unsigned char* sha1)
{
- void *index = p->index_base + 256;
+ void *index = p->index.base + 256;
if (n < 0 || num_packed_objects(p) <= n)
return -1;
hashcpy(sha1, (unsigned char *) index + (24 * n) + 4);
return 0;
}
-unsigned long find_pack_entry_one(const unsigned char *sha1,
+static int nth_packed_object_sha1_v1(const struct packed_git *p, int n,
+ unsigned char* sha1)
+{
+ void *index = p->index.base + 256 + 2;
+ if (n < 0 || num_packed_objects(p) <= n)
+ return -1;
+ hashcpy(sha1, (unsigned char *) index + (28 * n) + 8);
+ return 0;
+}
+
+int nth_packed_object_sha1(const struct packed_git *p, int n,
+ unsigned char* sha1)
+{
+ switch (p->index.version)
+ {
+ case 0:
+ return nth_packed_object_sha1_v0(p, n, sha1);
+ case 1:
+ return nth_packed_object_sha1_v1(p, n, sha1);
+ default:
+ die("Unsupported Pack Version v%d\n", p->index.version);
+ }
+}
+
+static off_t find_pack_entry_one_v0(const unsigned char *sha1,
struct packed_git *p)
{
- uint32_t *level1_ofs = p->index_base;
+ uint32_t *level1_ofs = p->index.base;
int hi = ntohl(level1_ofs[*sha1]);
int lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
- void *index = p->index_base + 256;
+ void *index = p->index.base + 256;
do {
int mi = (lo + hi) / 2;
@@ -1385,6 +1434,46 @@ unsigned long find_pack_entry_one(const unsigned char *sha1,
return 0;
}
+static off_t find_pack_entry_one_v1(const unsigned char *sha1,
+ struct packed_git *p)
+{
+ uint32_t *level1_ofs = p->index.base + 2;
+ int hi = ntohl(level1_ofs[*sha1]);
+ int lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1]));
+ uint32_t *index = p->index.base + 256 + 2;
+
+ do {
+ int mi = (lo + hi) / 2;
+ int cmp = hashcmp((unsigned char *)(index + (7 * mi) + 2), sha1);
+ if (!cmp){
+ off_t offset;
+ offset = ntohl(*(index + (7 * mi)));
+ offset <<=32;
+ offset |= ntohl(*(index + ((7 * mi) + 1)));
+ return offset;
+ }
+ if (cmp > 0)
+ hi = mi;
+ else
+ lo = mi+1;
+ } while (lo < hi);
+ return 0;
+}
+
+off_t find_pack_entry_one(const unsigned char *sha1,
+ struct packed_git *p)
+{
+ switch (p->index.version)
+ {
+ case 0:
+ return find_pack_entry_one_v0(sha1, p);
+ case 1:
+ return find_pack_entry_one_v1(sha1, p);
+ default:
+ die("Unsupported Pack Version: v%d\n", p->index.version);
+ }
+}
+
static int matches_pack_name(struct packed_git *p, const char *ig)
{
const char *last_c, *c;
diff --git a/show-index.c b/show-index.c
index a30a2de..c7ff161 100644
--- a/show-index.c
+++ b/show-index.c
@@ -1,28 +1,69 @@
#include "cache.h"
+#include "pack.h"
+#include <stdio.h>
int main(int argc, char **argv)
{
- int i;
- unsigned nr;
- unsigned int entry[6];
- static unsigned int top_index[256];
+ struct pack_idx_header hdr;
- if (fread(top_index, sizeof(top_index), 1, stdin) != 1)
- die("unable to read index");
- nr = 0;
- for (i = 0; i < 256; i++) {
- unsigned n = ntohl(top_index[i]);
- if (n < nr)
- die("corrupt index file");
- nr = n;
- }
- for (i = 0; i < nr; i++) {
- unsigned offset;
+ if (fread(&hdr, sizeof(hdr), 1, stdin) != 1)
+ die("unable to read header");
+ if (PACK_IDX_SIGNATURE != ntohl(hdr.idx_signature))
+ {
+ int i;
+ unsigned nr;
+ unsigned int entry[6];
+ static unsigned int top_index[256];
+ top_index[0] = hdr.idx_signature;
+ top_index[1] = hdr.idx_version;
+ if (fread(top_index+2, sizeof(top_index) - sizeof(top_index[0])*2, 1, stdin) != 1)
+ die("unable to read index");
+ nr = 0;
+ for (i = 0; i < 256; i++) {
+ unsigned n = ntohl(top_index[i]);
+ if (n < nr)
+ die("corrupt index file");
+ nr = n;
+ }
+ for (i = 0; i < nr; i++) {
+ unsigned offset;
- if (fread(entry, 24, 1, stdin) != 1)
- die("unable to read entry %u/%u", i, nr);
- offset = ntohl(entry[0]);
- printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1)));
+ if (fread(entry, 24, 1, stdin) != 1)
+ die("unable to read entry %u/%u", i, nr);
+ offset = ntohl(entry[0]);
+ printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1)));
+ }
+ return 0;
+ }
+ else if (1 == ntohl(hdr.idx_version))
+ {
+ unsigned int nr;
+ unsigned int i;
+ unsigned int entry[7];
+ uint32_t top_index[256];
+ if (fread(top_index, sizeof(top_index), 1, stdin) != 1)
+ die("unable to read index");
+ nr = 0;
+ for (i = 0; i < 256; i++) {
+ unsigned int n;
+ n = ntohl(top_index[i]);
+ if (n < nr)
+ die("corrupt version 1 index file");
+ nr = n;
+ }
+ for (i = 0; i < nr; i++) {
+ off_t offset;
+ if (fread(entry, 28, 1, stdin) != 1)
+ die("unable to read entry %u/%u", i, nr);
+ offset = ntohl(entry[0]);
+ offset <<= 32;
+ offset |= ntohl (entry[1]);
+ printf("%llu %s\n", (unsigned long long) offset,
+ sha1_to_hex((void *)(entry+2)));
+ }
+ return 0;
}
- return 0;
+ else
+ die("Unsupported header version %d.",
+ hdr.idx_version);
}
--
1.5.0.1
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-26 22:40 [PATCH] Support 64-bit indexes for pack files Troy Telford
@ 2007-02-26 23:55 ` Shawn O. Pearce
2007-02-27 0:24 ` Nicolas Pitre
` (2 more replies)
2007-02-28 19:46 ` Troy Telford
1 sibling, 3 replies; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-26 23:55 UTC (permalink / raw)
To: Troy Telford; +Cc: git
Troy Telford <ttelford.groups@gmail.com> wrote:
> As I've not been involved with git development before, I'm
> aware that this may already be on somebody's 'todo' list. It was an itch
> I needed to scratch, as I have a repository whose size is multiple gigabytes,
> and 'git clone' (by default) forces everything into a single
> packfile with >=git-1.5.0.
>
> The patch introduces a new packfile index version, which adds a:
> * header to the index file (index version info).
> * leaves the object indexes within the index at 32-bit
> * extends the offsets used to describe the packfile to 64-bit.
>
> The new index format is only used when the associated packfile
> becomes large enough to warrant a 64-bit index; otherwise the original
> index format is used.
Clearly a good deal of work has been put into this patch. I cannot
say that reviewed every bit of it... because...
Nico and I are neck deep in our pack version 4 topic. That topic
hits all of the same code you touched with your patch.
Our topic also requires us to change the index file format, and
in doing so we have decided to extend the index records to look
something like the following[*1*]:
object SHA-1
64-bit offset within packfile
32-bit index of next object in packfile
The latter field is to help pack-objects reuse existing packfile
data, as today it needs to sort everything on its own on the fly.
Having that last field of data will help avoid that, and will keep
the index nicely aligned for 64-bit accesses to the offset.
I want to say your patch shouldn't be merged without even bothering
to review it. The last time I was in this part of the git code
(implementing sliding mmap window) Nico and Junio also both went in
here and rewrote huge chunks. Their changes prevented sliding mmap
window from applying. It was 6 months before I got back around to
rewriting the patch.
Right now I'm neck deep in pack v4. I hope to have the topic in
pu-ready state by some time mid-next week, hopefully in time for
Junio's git day. I'm very unlikely to have the time to rewrite the
topic again until late June/July if something like your patch goes
in now.
So would you mind waiting a couple of weeks for 64 bit indexes?
*1*: This was Nico's idea; I'm in agreement with him about it.
The actual disk layout here has not been set in stone as we
want to kick around different layouts before we settle on
something final. I'm a little unhappy with the way the above
lays out in memory when we mmap it, for example.
--
Shawn.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-26 23:55 ` Shawn O. Pearce
@ 2007-02-27 0:24 ` Nicolas Pitre
2007-02-27 0:31 ` Shawn O. Pearce
2007-02-27 1:16 ` Troy Telford
2007-02-27 4:56 ` Nicolas Pitre
2 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 0:24 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Troy Telford, git
On Mon, 26 Feb 2007, Shawn O. Pearce wrote:
> Our topic also requires us to change the index file format, and
> in doing so we have decided to extend the index records to look
> something like the following[*1*]:
>
> object SHA-1
> 64-bit offset within packfile
> 32-bit index of next object in packfile
>
> The latter field is to help pack-objects reuse existing packfile
> data, as today it needs to sort everything on its own on the fly.
> Having that last field of data will help avoid that, and will keep
> the index nicely aligned for 64-bit accesses to the offset.
Wouldn't that later field help the sliding mmap code as well, knowing in
advance what storage size a given object has? (I didn't look at the
sliding mmap code so I don't know).
Actually I've been thinking about another format already.
What about keeping the pack offset as 32 bits like it is today, but for
index v2 if the top bit is set then this become an index into another
table containing 64-bit offsets as needed. This way there is no waste
of space for most projects where the pack has yet to reach the 2GB limit
for many years to come.
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 0:24 ` Nicolas Pitre
@ 2007-02-27 0:31 ` Shawn O. Pearce
2007-02-27 4:32 ` Nicolas Pitre
0 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-27 0:31 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Troy Telford, git
Nicolas Pitre <nico@cam.org> wrote:
> On Mon, 26 Feb 2007, Shawn O. Pearce wrote:
> > The latter field is to help pack-objects reuse existing packfile
> > data, as today it needs to sort everything on its own on the fly.
> > Having that last field of data will help avoid that, and will keep
> > the index nicely aligned for 64-bit accesses to the offset.
>
> Wouldn't that later field help the sliding mmap code as well, knowing in
> advance what storage size a given object has? (I didn't look at the
> sliding mmap code so I don't know).
Yes, it probably would.
> Actually I've been thinking about another format already.
>
> What about keeping the pack offset as 32 bits like it is today, but for
> index v2 if the top bit is set then this become an index into another
> table containing 64-bit offsets as needed. This way there is no waste
> of space for most projects where the pack has yet to reach the 2GB limit
> for many years to come.
Actually Troy's patch tries to do this by using the current format
and only switching to the new one if the packfile exceeds 4 GiB.
Rather smart.
One thought I had here was to expand the fan-out table from 1<<8
entries to 1<<16 entries, then store only the low 18 bytes of
the SHA-1. We would have another 2 bytes worth of space to store
the offset, pushing our total offset up to 48 bits.
--
Shawn.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-26 23:55 ` Shawn O. Pearce
2007-02-27 0:24 ` Nicolas Pitre
@ 2007-02-27 1:16 ` Troy Telford
2007-02-27 4:56 ` Nicolas Pitre
2 siblings, 0 replies; 20+ messages in thread
From: Troy Telford @ 2007-02-27 1:16 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: git
On Monday 26 February 2007, Shawn O. Pearce wrote:
> Nico and I are neck deep in our pack version 4 topic. That topic
> hits all of the same code you touched with your patch.
It looked like that may have been the case; but I figured it wouldn't hurt.
I've been coping with the 'too-small' index issue for about a year now, and
have been on IRC and kept at least a lazy eye on it hoping that it would be
attended to.
I've decided I need to start doing some C coding again, and it seemed to be as
good an itch as any to scratch...
> Our topic also requires us to change the index file format, and
> in doing so we have decided to extend the index records to look
> something like the following[*1*]:
>
> object SHA-1
> 64-bit offset within packfile
> 32-bit index of next object in packfile
>
> The latter field is to help pack-objects reuse existing packfile
> data, as today it needs to sort everything on its own on the fly.
> Having that last field of data will help avoid that, and will keep
> the index nicely aligned for 64-bit accesses to the offset.
Exactly why I left the packfile itself alone. Well, that and laziness. I may
have a huge repository, but I don't have single files so large that it needs
a 32-bit index for the next object in the packfile.
Most of the things that large I've seen are binary anyway.
> I want to say your patch shouldn't be merged without even bothering
> to review it.
No hard feelings on my part, it was as much a learning thing for me as
anything else.
> The last time I was in this part of the git code
> (implementing sliding mmap window) Nico and Junio also both went in
> here and rewrote huge chunks. Their changes prevented sliding mmap
> window from applying. It was 6 months before I got back around to
> rewriting the patch.
Ouch.
> Right now I'm neck deep in pack v4. I hope to have the topic in
> pu-ready state by some time mid-next week, hopefully in time for
> Junio's git day. I'm very unlikely to have the time to rewrite the
> topic again until late June/July if something like your patch goes
> in now.
>
> So would you mind waiting a couple of weeks for 64 bit indexes?
All I'm concerned with is being able to handle my (huge) source repository
with git. I wrote the patch in the hopes that it would accelerate the
process, and that I'd learn something. If all I have to do is wait that's
fine. I figured I'd at least be able to bring an idea or two to the table.
If the code doesn't get accepted, but still get the desired functionality, I
still met 1/2 of my goals in doing it.
--
Troy Telford
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 0:31 ` Shawn O. Pearce
@ 2007-02-27 4:32 ` Nicolas Pitre
2007-02-27 4:55 ` Geert Bosch
0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 4:32 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Troy Telford, git
On Mon, 26 Feb 2007, Shawn O. Pearce wrote:
> Nicolas Pitre <nico@cam.org> wrote:
> > Actually I've been thinking about another format already.
> >
> > What about keeping the pack offset as 32 bits like it is today, but for
> > index v2 if the top bit is set then this become an index into another
> > table containing 64-bit offsets as needed. This way there is no waste
> > of space for most projects where the pack has yet to reach the 2GB limit
> > for many years to come.
>
> Actually Troy's patch tries to do this by using the current format
> and only switching to the new one if the packfile exceeds 4 GiB.
> Rather smart.
Yes I saw the patch. But what I propose is different. In fact I'd
require far less changes to the existing code. The idea is to continue
to store a 32-bit value along with the SHA1 just like we do today.
Then, appended to that would be another table containing a list of
64-bit offsets.
Now if the offset stored in the index is smaller than 2GB you store it
as we do today. If it is >= 2GB then a 64-bit index would be added to
the extra offset table and the 32-bit entry along with the SHA1 would be
an index into that second table instead, with the top bit set to
distinguish it from a normal 32-bit offset (actually 31 bits). So for
offsets larger than 31 bits then they have an additional level of
indirection.
The code to implement this would be minimal. And since objects placed
at the end of a pack (those more likely to incure the indirection
overhead) are further back in history they won't get accessed
very often anyway.
Then nothing prevents us from inserting the next-object-index table in
between (its size is known while the 64-bit offset one may vary) then
the code that doesn't care about it need no look at it.
> One thought I had here was to expand the fan-out table from 1<<8
> entries to 1<<16 entries, then store only the low 18 bytes of
> the SHA-1. We would have another 2 bytes worth of space to store
> the offset, pushing our total offset up to 48 bits.
That would penalize small packs a lot. the index would always start
from 256KB in size. With a pack of 100 objects (our current treshold
for keeping a pack) that means a 258KB index file. Currently the index
file for a 100-object pack is 3.4KB.
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 4:32 ` Nicolas Pitre
@ 2007-02-27 4:55 ` Geert Bosch
2007-02-27 5:11 ` Nicolas Pitre
0 siblings, 1 reply; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 4:55 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Shawn O. Pearce, Troy Telford, git
On Feb 26, 2007, at 23:32, Nicolas Pitre wrote:
>> One thought I had here was to expand the fan-out table from 1<<8
>> entries to 1<<16 entries, then store only the low 18 bytes of
>> the SHA-1. We would have another 2 bytes worth of space to store
>> the offset, pushing our total offset up to 48 bits.
>
> That would penalize small packs a lot. the index would always start
> from 256KB in size. With a pack of 100 objects (our current treshold
> for keeping a pack) that means a 258KB index file. Currently the
> index
> file for a 100-object pack is 3.4KB.
Why can't we do it with the current 1<<8 entry fan-out?
This would allow increases of pack file size up to 1 TB.
For larger repositories, we just need to use multiple
pack files. A couple hundred 1 TB pack files doesn't seem
to be a big issue.
Say a couple years from now, we can write data to stable storage
(disks/flash/holograms or whatever) at 1 GB/sec, then it would still
take 16 minutes to write a 1 TB file. At that point we'd need a
bigger overhaul than just larger offsets in the pack file.
BTW, here are a few issues with the current pack file format:
- The final SHA1 consists of the count of objects in the file
and all compressed data. Why? This is horrible for streaming
applications where you only know the count of objects at the
end, then you need to access *all* data to compute the SHA-1.
Much better to just use compute a SHA1 over the SHA1's of each
object. That way at least the data streamed can be streamed to
disk. Buffering one SHA1 per object is probably going to be OK.
- The object count is implicit in the SHA1 of all objects and the
objects we find in the file. Why do we need it in the first place?
Better to recompute it when necessary. This makes true streaming
possible.
-Geert
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-26 23:55 ` Shawn O. Pearce
2007-02-27 0:24 ` Nicolas Pitre
2007-02-27 1:16 ` Troy Telford
@ 2007-02-27 4:56 ` Nicolas Pitre
2 siblings, 0 replies; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 4:56 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Troy Telford, git
On Mon, 26 Feb 2007, Shawn O. Pearce wrote:
> Nico and I are neck deep in our pack version 4 topic. That topic
> hits all of the same code you touched with your patch.
Although a good deal of code has been written already, we're still
throwing completely crazy ideas back and forth which, if they survive
the reality test, might change things quite a lot for the index.
> Our topic also requires us to change the index file format, and
> in doing so we have decided to extend the index records to look
> something like the following[*1*]:
[ delete new index format layout description ]
Although I suggested that format... like... yesterday, I'm already
dismissing it now. I'd go with what I just posted instead.
But yet this depends on whether or not we consider this latest idea of
ours a good thing or not, which require some math to be done to prove
its usefulness.
Maybe we should post our notes here in order to gather extra comments
from people or to debunk some of our insane ideas faster.
> I want to say your patch shouldn't be merged without even bothering
> to review it. The last time I was in this part of the git code
> (implementing sliding mmap window) Nico and Junio also both went in
> here and rewrote huge chunks. Their changes prevented sliding mmap
> window from applying. It was 6 months before I got back around to
> rewriting the patch.
The thing is, regardless of the pack v4 work, I think the approach used
is not the best one. And because of the pack v4 work, we'll have to
settle on something before too long anyway.
> So would you mind waiting a couple of weeks for 64 bit indexes?
Of course Troy's patch might be used by those who need it now. I think
Junio might even park it in pu for reference. The pack index is not a
precious piece of information and it can be thrown away and recreated
with git-index-pack whenever there is an officially supported new index
format.
> *1*: This was Nico's idea; I'm in agreement with him about it.
As mentioned above I'm not in agreement with myself anymore about it.
;-)
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 4:55 ` Geert Bosch
@ 2007-02-27 5:11 ` Nicolas Pitre
2007-02-27 16:04 ` Geert Bosch
0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 5:11 UTC (permalink / raw)
To: Geert Bosch; +Cc: Shawn O. Pearce, Troy Telford, git
On Mon, 26 Feb 2007, Geert Bosch wrote:
> Why can't we do it with the current 1<<8 entry fan-out?
> This would allow increases of pack file size up to 1 TB.
I had the exact same thought while I was writing the previous mail.
It is indeed perfectly fine and would require less than 10 lines of code
to implement.
> BTW, here are a few issues with the current pack file format:
> - The final SHA1 consists of the count of objects in the file
> and all compressed data. Why? This is horrible for streaming
> applications where you only know the count of objects at the
> end, then you need to access *all* data to compute the SHA-1.
> Much better to just use compute a SHA1 over the SHA1's of each
> object. That way at least the data streamed can be streamed to
> disk. Buffering one SHA1 per object is probably going to be OK.
We always know the number of objects before actually constructing or
streaming a pack. Finding best delta matches require that we sort the
object list by type, but for good locality we need to re-sort that list
by recency. So we always know the number of objects before starting to
write since we need to have the list of objects in memory anyway.
Also the receiving end of a streamed pack wants to know the number of
objects first if only to provide the user with some progress report.
> - The object count is implicit in the SHA1 of all objects and the
> objects we find in the file. Why do we need it in the first place?
> Better to recompute it when necessary. This makes true streaming
> possible.
Sorry, I don't follow you here.
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 5:11 ` Nicolas Pitre
@ 2007-02-27 16:04 ` Geert Bosch
2007-02-27 16:11 ` Shawn O. Pearce
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 16:04 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Shawn O. Pearce, Troy Telford, git
On Feb 27, 2007, at 00:11, Nicolas Pitre wrote:
>> BTW, here are a few issues with the current pack file format:
>> - The final SHA1 consists of the count of objects in the file
>> and all compressed data. Why? This is horrible for streaming
>> applications where you only know the count of objects at the
>> end, then you need to access *all* data to compute the SHA-1.
>> Much better to just use compute a SHA1 over the SHA1's of each
>> object. That way at least the data streamed can be streamed to
>> disk. Buffering one SHA1 per object is probably going to be OK.
>
> We always know the number of objects before actually constructing or
> streaming a pack. Finding best delta matches require that we sort the
> object list by type, but for good locality we need to re-sort that
> list
> by recency. So we always know the number of objects before
> starting to
> write since we need to have the list of objects in memory anyway.
When I import a large code-base (such as a *.tar.gz), I don't know
beforehand how many objects I'm going to create. Ideally, I'd like
to stream them directly into a new pack without ever having to write
the expanded source to the filesystem.
> Also the receiving end of a streamed pack wants to know the number of
> objects first if only to provide the user with some progress report.
>
>> - The object count is implicit in the SHA1 of all objects and the
>> objects we find in the file. Why do we need it in the first place?
>> Better to recompute it when necessary. This makes true streaming
>> possible.
>
> Sorry, I don't follow you here.
The object-count at the beginning of the pack is a little strange for
local on-disk pack files, as it is data that can easily be derived.
The *index* would seem to be the proper place for this.
Also, it is not possible to write a dummy 0 in the count and then
fill in
the correct count at the end, because the final SHA1 at the end of
the pack
file is a checksum over the count followed by all the pack data.
So for creating a large pack from a stream of data, you have to do
the following:
1. write out a temporary pack file to disk without correct count
2. fix-up the count
3. read the entire temporary pack file to compute the final SHA-1
4. fix-up the SHA1 at the end of the file
5. construct and write out the index
There are a few ways to fixing this:
- Have a count of 0xffffffff mean: look in the index for the count.
Pulling/pushing would still use regular counted pack files.
- Have the pack file checksum be the SHA1 of (the count followed
by the SHA1 of the compressed data of each object). This would
allow 3.
to be done without reading back all data.
-Geert
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 16:04 ` Geert Bosch
@ 2007-02-27 16:11 ` Shawn O. Pearce
2007-02-27 16:55 ` Geert Bosch
2007-02-27 17:03 ` Nicolas Pitre
2007-02-27 20:05 ` Johannes Schindelin
2 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-27 16:11 UTC (permalink / raw)
To: Geert Bosch; +Cc: Nicolas Pitre, Troy Telford, git
Geert Bosch <bosch@adacore.com> wrote:
> When I import a large code-base (such as a *.tar.gz), I don't know
> beforehand how many objects I'm going to create. Ideally, I'd like
> to stream them directly into a new pack without ever having to write
> the expanded source to the filesystem.
See git-fast-import. If you are coming from a tar, also see
contrib/fast-import/import-tars.perl. :-)
> So for creating a large pack from a stream of data, you have to do
> the following:
> 1. write out a temporary pack file to disk without correct count
> 2. fix-up the count
> 3. read the entire temporary pack file to compute the final SHA-1
> 4. fix-up the SHA1 at the end of the file
> 5. construct and write out the index
Yes, this is exactly what git-fast-import does. Yes, it sort of
sucks. But its not as bad as you think.
> There are a few ways to fixing this:
> - Have a count of 0xffffffff mean: look in the index for the count.
> Pulling/pushing would still use regular counted pack files.
> - Have the pack file checksum be the SHA1 of (the count followed
> by the SHA1 of the compressed data of each object). This would
> allow 3.
> to be done without reading back all data.
I don't think it is worth it. Aside from git-fast-import we
always know the object count before we start writing any data.
But despite that, fast-import runs quite well.
--
Shawn.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 16:11 ` Shawn O. Pearce
@ 2007-02-27 16:55 ` Geert Bosch
2007-02-27 17:36 ` Nicolas Pitre
0 siblings, 1 reply; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 16:55 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Nicolas Pitre, Troy Telford, git
On Feb 27, 2007, at 11:11, Shawn O. Pearce wrote:
> Geert Bosch <bosch@adacore.com> wrote:
>> When I import a large code-base (such as a *.tar.gz), I don't know
>> beforehand how many objects I'm going to create. Ideally, I'd like
>> to stream them directly into a new pack without ever having to write
>> the expanded source to the filesystem.
>
> See git-fast-import. If you are coming from a tar, also see
> contrib/fast-import/import-tars.perl. :-)
Yes, I saw that, really nice. I had written something myself to
create pack files from a streaming data source. Basically, I'm
breaking arbitrary data-streams (mostly backups) into chunks
along content-defined boundaries and then link the chunks
together in a tree. This eliminates any duplicate files,
and even chunks of larger files that are identical. I create
new pack files whenever the old one gets larger than a certain
predefined size (128 MB, currently).
I'm happy I can base this on git-fast-import now.
>> So for creating a large pack from a stream of data, you have to do
>> the following:
>> 1. write out a temporary pack file to disk without correct count
>> 2. fix-up the count
>> 3. read the entire temporary pack file to compute the final SHA-1
>> 4. fix-up the SHA1 at the end of the file
>> 5. construct and write out the index
>
> Yes, this is exactly what git-fast-import does. Yes, it sort of
> sucks. But its not as bad as you think.
For smaller packs, the I/O is all going to be buffered anyway,
but if we're going to have >4GB pack files, it adds a lot of real
I/O and SHA1 computation for no good reason. If we get a rare chance
to have a new pack format, why not fix this wart at the same time?
-Geert
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 16:04 ` Geert Bosch
2007-02-27 16:11 ` Shawn O. Pearce
@ 2007-02-27 17:03 ` Nicolas Pitre
2007-02-27 20:05 ` Johannes Schindelin
2 siblings, 0 replies; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 17:03 UTC (permalink / raw)
To: Geert Bosch; +Cc: Shawn O. Pearce, Troy Telford, git
On Tue, 27 Feb 2007, Geert Bosch wrote:
>
> On Feb 27, 2007, at 00:11, Nicolas Pitre wrote:
>
> > > BTW, here are a few issues with the current pack file format:
> > > - The final SHA1 consists of the count of objects in the file
> > > and all compressed data. Why? This is horrible for streaming
> > > applications where you only know the count of objects at the
> > > end, then you need to access *all* data to compute the SHA-1.
> > > Much better to just use compute a SHA1 over the SHA1's of each
> > > object. That way at least the data streamed can be streamed to
> > > disk. Buffering one SHA1 per object is probably going to be OK.
> >
> > We always know the number of objects before actually constructing or
> > streaming a pack. Finding best delta matches require that we sort the
> > object list by type, but for good locality we need to re-sort that list
> > by recency. So we always know the number of objects before starting to
> > write since we need to have the list of objects in memory anyway.
>
> When I import a large code-base (such as a *.tar.gz), I don't know
> beforehand how many objects I'm going to create. Ideally, I'd like
> to stream them directly into a new pack without ever having to write
> the expanded source to the filesystem.
Have a look at git-fast-import and contrib/fast-import/import-tars.perl.
Regardless, you cannot produce a decent pack without knowing in advance
how many objects you have. Thisis why git-fast-import produces a
suboptimal pack that needs to be repacked in the end.
> The object-count at the beginning of the pack is a little strange for
> local on-disk pack files, as it is data that can easily be derived.
> The *index* would seem to be the proper place for this.
The index is _not_ sent over the network protocol by design. But like I
said the receiving end wants to know up front how many objects it'll
have to parse.
> Also, it is not possible to write a dummy 0 in the count and then fill in
> the correct count at the end, because the final SHA1 at the end of the pack
> file is a checksum over the count followed by all the pack data.
> So for creating a large pack from a stream of data, you have to do the
> following:
> 1. write out a temporary pack file to disk without correct count
> 2. fix-up the count
> 3. read the entire temporary pack file to compute the final SHA-1
> 4. fix-up the SHA1 at the end of the file
> 5. construct and write out the index
Sure. That's in fact what index-pack does when it has to fix up a thin
pack. But did you find any _real_ issue with this? Resolving deltas and
recomputing the index is far more costly than simply redoing the whole
pack checksum.
> There are a few ways to fixing this:
> - Have a count of 0xffffffff mean: look in the index for the count.
> Pulling/pushing would still use regular counted pack files.
> - Have the pack file checksum be the SHA1 of (the count followed
> by the SHA1 of the compressed data of each object). This would allow 3.
> to be done without reading back all data.
Well we _could_ exclude the object count (or even the entire pack
header for that matter) from the pack checksum. This way a streamed
pack streamed to disk could be seeked back to have its object number
fixed up once it is known without needing to start the checksum all
over. Maybe something to consider for pack v4. But in practice I don't
see that as a big issue.
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 16:55 ` Geert Bosch
@ 2007-02-27 17:36 ` Nicolas Pitre
2007-02-28 3:52 ` Shawn O. Pearce
0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-27 17:36 UTC (permalink / raw)
To: Geert Bosch; +Cc: Shawn O. Pearce, Troy Telford, git
On Tue, 27 Feb 2007, Geert Bosch wrote:
> For smaller packs, the I/O is all going to be buffered anyway,
> but if we're going to have >4GB pack files, it adds a lot of real
> I/O and SHA1 computation for no good reason. If we get a rare chance
> to have a new pack format, why not fix this wart at the same time?
Fair enough.
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 16:04 ` Geert Bosch
2007-02-27 16:11 ` Shawn O. Pearce
2007-02-27 17:03 ` Nicolas Pitre
@ 2007-02-27 20:05 ` Johannes Schindelin
2007-02-27 20:25 ` Geert Bosch
2 siblings, 1 reply; 20+ messages in thread
From: Johannes Schindelin @ 2007-02-27 20:05 UTC (permalink / raw)
To: Geert Bosch; +Cc: Nicolas Pitre, Shawn O. Pearce, Troy Telford, git
Hi,
On Tue, 27 Feb 2007, Geert Bosch wrote:
> The object-count at the beginning of the pack is a little strange for
> local on-disk pack files, as it is data that can easily be derived.
The SHA1 is also easily derived. So think of it as a doubly secure way to
ensure integrity.
Remember, there were some people unable to accept that SHA1 collisions are
_unlikely_...
Ciao,
Dscho
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 20:05 ` Johannes Schindelin
@ 2007-02-27 20:25 ` Geert Bosch
2007-02-27 20:35 ` Johannes Schindelin
0 siblings, 1 reply; 20+ messages in thread
From: Geert Bosch @ 2007-02-27 20:25 UTC (permalink / raw)
To: Johannes Schindelin; +Cc: Nicolas Pitre, Shawn O. Pearce, Troy Telford, git
On Feb 27, 2007, at 15:05, Johannes Schindelin wrote:
> On Tue, 27 Feb 2007, Geert Bosch wrote:
>
>> The object-count at the beginning of the pack is a little strange for
>> local on-disk pack files, as it is data that can easily be derived.
>
> The SHA1 is also easily derived. So think of it as a doubly secure
> way to
> ensure integrity.
>
> Remember, there were some people unable to accept that SHA1
> collisions are
> _unlikely_...
Ah, you include the count because you don't trust the SHA1
and now you can count that you got the right number of objects ;-)
It's just that I don't see a use for the count in a local on-disk
pack. In these cases we'll always have an index. We can keep the
current protocol for sending/receiving packs, no need to ever upgrade
to v4 there.
-Geert
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 20:25 ` Geert Bosch
@ 2007-02-27 20:35 ` Johannes Schindelin
0 siblings, 0 replies; 20+ messages in thread
From: Johannes Schindelin @ 2007-02-27 20:35 UTC (permalink / raw)
To: Geert Bosch; +Cc: Nicolas Pitre, Shawn O. Pearce, Troy Telford, git
Hi,
On Tue, 27 Feb 2007, Geert Bosch wrote:
>
> On Feb 27, 2007, at 15:05, Johannes Schindelin wrote:
> > On Tue, 27 Feb 2007, Geert Bosch wrote:
> >
> > > The object-count at the beginning of the pack is a little strange
> > > for local on-disk pack files, as it is data that can easily be
> > > derived.
> >
> > The SHA1 is also easily derived. So think of it as a doubly secure way
> > to ensure integrity.
> >
> > Remember, there were some people unable to accept that SHA1 collisions
> > are _unlikely_...
>
> Ah, you include the count because you don't trust the SHA1 and now you
> can count that you got the right number of objects ;-)
I don't know if it was included for that reason, but you can excuse it
that way. :-)
> It's just that I don't see a use for the count in a local on-disk pack.
> In these cases we'll always have an index. We can keep the current
> protocol for sending/receiving packs, no need to ever upgrade to v4
> there.
But the protocol expects a valid pack! (Including a count...)
Besides, the pack index is a purely local thin, reconstructed from the
pack when fetching...
Ciao,
Dscho
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-27 17:36 ` Nicolas Pitre
@ 2007-02-28 3:52 ` Shawn O. Pearce
2007-02-28 4:12 ` Nicolas Pitre
0 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-02-28 3:52 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Geert Bosch, Troy Telford, git
Nicolas Pitre <nico@cam.org> wrote:
> On Tue, 27 Feb 2007, Geert Bosch wrote:
> > For smaller packs, the I/O is all going to be buffered anyway,
> > but if we're going to have >4GB pack files, it adds a lot of real
> > I/O and SHA1 computation for no good reason. If we get a rare chance
> > to have a new pack format, why not fix this wart at the same time?
>
> Fair enough.
OK, so lets say that if both ends of a network transport support
pack v4 then we can use pack v4. If pack v4 omits the count field
from its header (because its easily derived or obtained from the
index, and doesn't add any additional data protection over the
SHA-1s) why not add some machine-readable sideband that can provide
transfer progress?
I think we would want four values, number of objects (sent/total)
and uncompressed bytes (sent/total), to send to the client.
Estimating the total uncompressed bytes is very easy in pack-objects
before we start sending even the header; actually if we are reusing
a majority of the objects from an existing packfile we even have
a good approximation of the compressed size ready. That would
give the client a reasonable progress meter; certainly better than
nothing at all! ;-)
--
Shawn.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-28 3:52 ` Shawn O. Pearce
@ 2007-02-28 4:12 ` Nicolas Pitre
0 siblings, 0 replies; 20+ messages in thread
From: Nicolas Pitre @ 2007-02-28 4:12 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Geert Bosch, Troy Telford, git
On Tue, 27 Feb 2007, Shawn O. Pearce wrote:
> Nicolas Pitre <nico@cam.org> wrote:
> > On Tue, 27 Feb 2007, Geert Bosch wrote:
> > > For smaller packs, the I/O is all going to be buffered anyway,
> > > but if we're going to have >4GB pack files, it adds a lot of real
> > > I/O and SHA1 computation for no good reason. If we get a rare chance
> > > to have a new pack format, why not fix this wart at the same time?
> >
> > Fair enough.
>
> OK, so lets say that if both ends of a network transport support
> pack v4 then we can use pack v4. If pack v4 omits the count field
> from its header (because its easily derived or obtained from the
> index, and doesn't add any additional data protection over the
> SHA-1s)
I don't agree we should omit it at all.
I agree it might just not be included in the whole pack SHA1. Actually
the whole pack header might be ommitted from the pack SHA1. This way, if
data is appended at the end like index-pack does in the thin pack case,
the header can be fixed up without invalidating the SHA1 that was
computed while receiving the pack. Why the whole header? Well who
knows if we won't dynamically change the pack version while receiving it
as well in the future. In fact, we could just add the header to the
SHA1 checksum but after the data payload (for the checksum only not in
the actual pack). this way we preserve the same protection/validity
strength as we have today.
> why not add some machine-readable sideband that can provide
> transfer progress?
>
> I think we would want four values, number of objects (sent/total)
> and uncompressed bytes (sent/total), to send to the client.
Uncompressed bytes is meaningless when evaluating throughput of
compressed data since compression ratio is variable during a single
fetch. It doesn't provide more accurate progress than what we have
today with number of objects either.
> Estimating the total uncompressed bytes is very easy in pack-objects
> before we start sending even the header; actually if we are reusing
> a majority of the objects from an existing packfile we even have
> a good approximation of the compressed size ready. That would
> give the client a reasonable progress meter; certainly better than
> nothing at all! ;-)
IMHO it is not worth it. And it won't help git-pack-index if run on a
local pack either.
Nicolas
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] Support 64-bit indexes for pack files.
2007-02-26 22:40 [PATCH] Support 64-bit indexes for pack files Troy Telford
2007-02-26 23:55 ` Shawn O. Pearce
@ 2007-02-28 19:46 ` Troy Telford
1 sibling, 0 replies; 20+ messages in thread
From: Troy Telford @ 2007-02-28 19:46 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: git
Subject: [PATCH] 32-bit support for large indexes
This patch continues the work I've done on large indexes, and was completed
largely as an educational exercise; I'm obviously aware of Pack v4 making
these two patches unnecessary, and that inclusion is unlikely. I'm
submitting the patch to make my previous patch on 64-bit indexes more
complete, should anyone actually use it.
It also introduced changes that may be expressing a bug
in the SHA1 code. (Or at least, that's all myself and Eric Biederman can
think of... but it was late and we were tired.)
The long and the short of it is a large number of 'offset' variables are
'unsigned long', and needed to be changed to 'off_t', with a few cleanups
to accommodate this change.
The primary goal was to get the extended index to work on x86 (as well as
x86_64) machines. Operations such as 'git fetch/pull', 'git cat-file -p', and
of course 'git-fsck --full' were either failing or reporting errors.
In the process of testing, I found an issue on x86 (x86_64 worked fine) where
'git-fsck' would report problems "error: packed <sha1> from <packfile> is
corrupt" (On 3 files in a very large repository; the number of 'corrupt' files
is a small percentage)
The patch has some (currently disabled) code that would write the offending
blobs to disk, as well as print some related data to the screen. In the test
repository, the 'corrupt' blobs were fairly large binaries, which ended up
being either tar.gz or tar.bz2. (Of sizes 200-19 MB). I checked the blobs for
internal consistency (gzip -t and bzip2 -t), and they checked out OK.
Similarly, git-cat-file -p could extract the 'corrupt' blob without any issues.
Being a bit confused at this point, I checked with Eric Biederman, who also
couldn't find why the blob was being labeled as 'corrupt'. Eric's only guess
was that something funny was happening in the SHA1 generation, which I don't
believe my code even touches.
That being said, the only problem I've found at this point is with git-fsck,
and then only with 32-bit machines. Other (previously broken with both
the old and 'new' index formats) operations such as fetch/pull, and cat-file
now work on 32-bit systems. This marks the first time I've ever been able
to use git with my large repository on 32-bit systems.
---
builtin-pack-objects.c | 30 ++++++++++----------
builtin-unpack-objects.c | 12 ++++----
cache.h | 6 ++--
index-pack.c | 20 +++++++------
pack-check.c | 35 ++++++++++++++++++++----
sha1_file.c | 67 ++++++++++++++++++++++++++-------------------
6 files changed, 103 insertions(+), 67 deletions(-)
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 92087c1..c1b41b5 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -23,7 +23,7 @@ git-pack-objects [{ -q | --progress | --all-progress }] \n\
struct object_entry {
unsigned char sha1[20];
unsigned long size; /* uncompressed size */
- unsigned long offset; /* offset into the final pack file;
+ off_t offset; /* offset into the final pack file;
* nonzero if already written.
*/
unsigned int depth; /* delta depth */
@@ -222,7 +222,7 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
}
static struct revindex_entry * find_packed_object(struct packed_git *p,
- unsigned int ofs)
+ off_t ofs)
{
int num;
int lo, hi;
@@ -250,15 +250,15 @@ static struct revindex_entry * find_packed_object(struct packed_git *p,
die("internal error: pack revindex corrupt");
}
-static unsigned long find_packed_object_size(struct packed_git *p,
- unsigned long ofs)
+static off_t find_packed_object_size(struct packed_git *p,
+ off_t ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
return entry[1].offset - ofs;
}
static unsigned char *find_packed_object_name(struct packed_git *p,
- unsigned long ofs)
+ off_t ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
if (p->index.version == 0)
@@ -320,7 +320,7 @@ static int encode_header(enum object_type type, unsigned long size, unsigned cha
*/
static int check_pack_inflate(struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
unsigned long len,
unsigned long expect)
{
@@ -347,7 +347,7 @@ static int check_pack_inflate(struct packed_git *p,
static void copy_pack_data(struct sha1file *f,
struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
unsigned long len)
{
unsigned char *in;
@@ -483,7 +483,7 @@ static unsigned long write_object(struct sha1file *f,
* encoding of the relative offset for the delta
* base from this object's position in the pack.
*/
- unsigned long ofs = entry->offset - entry->delta->offset;
+ off_t ofs = entry->offset - entry->delta->offset;
unsigned pos = sizeof(header) - 1;
header[pos] = ofs & 127;
while (ofs >>= 7)
@@ -504,7 +504,7 @@ static unsigned long write_object(struct sha1file *f,
else {
struct packed_git *p = entry->in_pack;
struct pack_window *w_curs = NULL;
- unsigned long offset;
+ off_t offset;
if (entry->delta) {
obj_type = (allow_ofs_delta && entry->delta->offset) ?
@@ -514,7 +514,7 @@ static unsigned long write_object(struct sha1file *f,
hdrlen = encode_header(obj_type, entry->size, header);
sha1write(f, header, hdrlen);
if (obj_type == OBJ_OFS_DELTA) {
- unsigned long ofs = entry->offset - entry->delta->offset;
+ off_t ofs = entry->offset - entry->delta->offset;
unsigned pos = sizeof(header) - 1;
header[pos] = ofs & 127;
while (ofs >>= 7)
@@ -544,7 +544,7 @@ static unsigned long write_object(struct sha1file *f,
static unsigned long write_one(struct sha1file *f,
struct object_entry *e,
- unsigned long offset)
+ off_t offset)
{
if (e->offset || e->preferred_base)
/* offset starts from header size and cannot be zero
@@ -562,7 +562,7 @@ static void write_pack_file(void)
{
int i;
struct sha1file *f;
- unsigned long offset;
+ off_t offset;
struct pack_header hdr;
unsigned last_percent = 999;
int do_progress = progress;
@@ -721,7 +721,7 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud
if (!exclude) {
for (p = packed_git; p; p = p->next) {
- unsigned long offset = find_pack_entry_one(sha1, p);
+ off_t offset = find_pack_entry_one(sha1, p);
if (offset) {
if (incremental)
return 0;
@@ -1044,7 +1044,7 @@ static void check_object(struct object_entry *entry)
*/
if (!no_reuse_delta) {
unsigned char c, *base_name;
- unsigned long ofs;
+ off_t ofs;
unsigned long used_0;
/* there is at least 20 bytes left in the pack */
switch (entry->in_pack_type) {
@@ -1061,7 +1061,7 @@ static void check_object(struct object_entry *entry)
ofs = c & 127;
while (c & 128) {
ofs += 1;
- if (!ofs || ofs & ~(~0UL >> 7))
+ if (!ofs || ofs & ~(~((off_t) 0) >> 7))
die("delta base offset overflow in pack for %s",
sha1_to_hex(entry->sha1));
c = buf[used_0++];
diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c
index 8f8e898..b85c557 100644
--- a/builtin-unpack-objects.c
+++ b/builtin-unpack-objects.c
@@ -13,7 +13,7 @@ static const char unpack_usage[] = "git-unpack-objects [-n] [-q] [-r] < pack-fil
/* We always read in 4kB chunks. */
static unsigned char buffer[4096];
-static unsigned long offset, len, consumed_bytes;
+static off_t offset, len, consumed_bytes;
static SHA_CTX ctx;
/*
@@ -88,7 +88,7 @@ static void *get_data(unsigned long size)
struct delta_info {
unsigned char base_sha1[20];
- unsigned long base_offset;
+ off_t base_offset;
unsigned long size;
void *delta;
unsigned nr;
@@ -98,7 +98,7 @@ struct delta_info {
static struct delta_info *delta_list;
static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1,
- unsigned long base_offset,
+ off_t base_offset,
void *delta, unsigned long size)
{
struct delta_info *info = xmalloc(sizeof(*info));
@@ -113,7 +113,7 @@ static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1,
}
struct obj_info {
- unsigned long offset;
+ off_t offset;
unsigned char sha1[20];
};
@@ -209,7 +209,7 @@ static void unpack_delta_entry(enum object_type kind, unsigned long delta_size,
} else {
unsigned base_found = 0;
unsigned char *pack, c;
- unsigned long base_offset;
+ off_t base_offset;
unsigned lo, mid, hi;
pack = fill(1);
@@ -218,7 +218,7 @@ static void unpack_delta_entry(enum object_type kind, unsigned long delta_size,
base_offset = c & 127;
while (c & 128) {
base_offset += 1;
- if (!base_offset || base_offset & ~(~0UL >> 7))
+ if (!base_offset || base_offset & ~(~((off_t) 0) >> 7))
die("offset value overflow for delta base object");
pack = fill(1);
c = *pack;
diff --git a/cache.h b/cache.h
index b329ca4..88565b0 100644
--- a/cache.h
+++ b/cache.h
@@ -420,15 +420,15 @@ extern struct packed_git *find_sha1_pack(const unsigned char *sha1,
struct packed_git *packs);
extern void pack_report(void);
-extern unsigned char* use_pack(struct packed_git *, struct pack_window **, unsigned long, unsigned int *);
+extern unsigned char* use_pack(struct packed_git *, struct pack_window **, off_t, unsigned int *);
extern void unuse_pack(struct pack_window **);
extern struct packed_git *add_packed_git(char *, int, int);
extern int num_packed_objects(const struct packed_git *p);
extern int nth_packed_object_sha1(const struct packed_git *, int, unsigned char*);
extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
-extern void *unpack_entry(struct packed_git *, unsigned long, char *, unsigned long *);
+extern void *unpack_entry(struct packed_git *, off_t, char *, unsigned long *);
extern unsigned long unpack_object_header_gently(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
-extern void packed_object_info_detail(struct packed_git *, unsigned long, char *, unsigned long *, unsigned long *, unsigned int *, unsigned char *);
+extern void packed_object_info_detail(struct packed_git *, off_t, char *, unsigned long *, unsigned long *, unsigned int *, unsigned char *);
/* Dumb servers support */
extern int update_server_info(int);
diff --git a/index-pack.c b/index-pack.c
index cc22ea2..ef10e38 100644
--- a/index-pack.c
+++ b/index-pack.c
@@ -22,7 +22,7 @@ struct object_entry
union delta_base {
unsigned char sha1[20];
- unsigned long offset;
+ off_t offset;
};
/*
@@ -173,10 +173,10 @@ static void parse_pack_header(void)
use(sizeof(struct pack_header));
}
-static void bad_object(unsigned long offset, const char *format,
+static void bad_object(off_t offset, const char *format,
...) NORETURN __attribute__((format (printf, 2, 3)));
-static void bad_object(unsigned long offset, const char *format, ...)
+static void bad_object(off_t offset, const char *format, ...)
{
va_list params;
char buf[1024];
@@ -184,10 +184,11 @@ static void bad_object(unsigned long offset, const char *format, ...)
va_start(params, format);
vsnprintf(buf, sizeof(buf), format, params);
va_end(params);
- die("pack has bad object at offset %lu: %s", offset, buf);
+ die("pack has bad object at offset %llu: %s",
+ (unsigned long long) offset, buf);
}
-static void *unpack_entry_data(unsigned long offset, unsigned long size)
+static void *unpack_entry_data(off_t offset, unsigned long size)
{
z_stream stream;
void *buf = xmalloc(size);
@@ -216,7 +217,8 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size)
static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
{
unsigned char *p, c;
- unsigned long size, base_offset;
+ unsigned long size;
+ off_t base_offset;
unsigned shift;
obj->offset = consumed_bytes;
@@ -249,7 +251,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
base_offset = c & 127;
while (c & 128) {
base_offset += 1;
- if (!base_offset || base_offset & ~(~0UL >> 7))
+ if (!base_offset || base_offset & ~(~((off_t)0) >> 7))
bad_object(obj->offset, "offset value overflow for delta base object");
p = fill(1);
c = *p;
@@ -275,8 +277,8 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
static void *get_data_from_pack(struct object_entry *obj)
{
- unsigned long from = obj[0].offset + obj[0].hdr_size;
- unsigned long len = obj[1].offset - from;
+ off_t from = obj[0].offset + obj[0].hdr_size;
+ off_t len = obj[1].offset - from;
unsigned char *src, *data;
z_stream stream;
int st;
diff --git a/pack-check.c b/pack-check.c
index 12ee933..af5fd3b 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -8,7 +8,7 @@ static int verify_packfile(struct packed_git *p,
void *index_base = p->index.base;
SHA_CTX ctx;
unsigned char sha1[20];
- unsigned long offset = 0, pack_sig = p->pack_size - 20;
+ off_t offset = 0, pack_sig = p->pack_size - 20;
int nr_objects, err, i;
/* Note that the pack header checks are actually performed by
@@ -22,9 +22,11 @@ static int verify_packfile(struct packed_git *p,
unsigned int remaining;
unsigned char *in = use_pack(p, w_curs, offset, &remaining);
offset += remaining;
+/* printf("Offset: %llu\tpack_sig: %llu\tremaining: %u\tfirst foo\n",(unsigned long long) offset, (unsigned long long) pack_sig, remaining);*/
if (offset > pack_sig)
remaining -= offset - pack_sig;
SHA1_Update(&ctx, in, remaining);
+/* printf("Offset: %llu\tpack_sig: %llu\tremaining: %u\n",(unsigned long long) offset, (unsigned long long) pack_sig, remaining);*/
}
SHA1_Final(sha1, &ctx);
if (hashcmp(sha1, use_pack(p, w_curs, pack_sig, NULL)))
@@ -33,6 +35,7 @@ static int verify_packfile(struct packed_git *p,
if (hashcmp(sha1, (unsigned char *)index_base + index_size - 40))
return error("Packfile %s SHA1 mismatch with idx",
p->pack_name);
+/* printf("Success; pack name: %s\t%s\n", p->pack_name, sha1_to_hex(sha1));*/
unuse_pack(w_curs);
/* Make sure everything reachable from idx is valid. Since we
@@ -44,7 +47,8 @@ static int verify_packfile(struct packed_git *p,
unsigned char sha1[20];
void *data;
char type[20];
- unsigned long size, offset;
+ unsigned long size;
+ off_t offset;
if (nth_packed_object_sha1(p, i, sha1))
die("internal error pack-check nth-packed-object");
@@ -58,6 +62,23 @@ static int verify_packfile(struct packed_git *p,
continue;
}
if (check_sha1_signature(sha1, data, size, type)) {
+#if 0
+ int fd;
+ fd = open(sha1_to_hex(sha1), O_RDWR | O_CREAT,0777);
+ if (fd >= 0) {
+ ssize_t written = 0, chunk;
+ while (written != size) {
+ chunk = xwrite(fd, data + written, size - written);
+ if (chunk < 0)
+ die("OUch!");
+ written += chunk;
+ }
+ close(fd);
+ printf("wrote the data\n");
+ }
+#endif
+ printf("Size=%lu\n",size);
+
err = error("packed %s from %s is corrupt",
sha1_to_hex(sha1), p->pack_name);
free(data);
@@ -85,7 +106,7 @@ static void show_pack_info(struct packed_git *p)
char type[20];
unsigned long size;
unsigned long store_size;
- unsigned long offset;
+ off_t offset;
unsigned int delta_chain_length;
if (nth_packed_object_sha1(p, i, sha1))
@@ -99,10 +120,12 @@ static void show_pack_info(struct packed_git *p)
base_sha1);
printf("%s ", sha1_to_hex(sha1));
if (!delta_chain_length)
- printf("%-6s %lu %lu\n", type, size, offset);
+ printf("%-6s %lu %llu\n", type, size,
+ (unsigned long long) offset);
else {
- printf("%-6s %lu %lu %u %s\n", type, size, offset,
- delta_chain_length, sha1_to_hex(base_sha1));
+ printf("%-6s %lu %llu %u %s\n", type, size,
+ (unsigned long long) offset,
+ delta_chain_length, sha1_to_hex(base_sha1));
if (delta_chain_length < MAX_CHAIN)
chain_histogram[delta_chain_length]++;
else
diff --git a/sha1_file.c b/sha1_file.c
index c5ab5cc..23c5cb2 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -646,7 +646,7 @@ static int open_packed_git(struct packed_git *p)
return -1;
}
-static int in_window(struct pack_window *win, unsigned long offset)
+static int in_window(struct pack_window *win, off_t offset)
{
/* We must promise at least 20 bytes (one hash) after the
* offset is available from this window, otherwise the offset
@@ -661,7 +661,7 @@ static int in_window(struct pack_window *win, unsigned long offset)
unsigned char* use_pack(struct packed_git *p,
struct pack_window **w_cursor,
- unsigned long offset,
+ off_t offset,
unsigned int *left)
{
struct pack_window *win = *w_cursor;
@@ -688,9 +688,9 @@ unsigned char* use_pack(struct packed_git *p,
size_t window_align = packed_git_window_size / 2;
win = xcalloc(1, sizeof(*win));
win->offset = (offset / window_align) * window_align;
- win->len = p->pack_size - win->offset;
- if (win->len > packed_git_window_size)
- win->len = packed_git_window_size;
+ win->len = packed_git_window_size;
+ if (win->len > p->pack_size - win->offset)
+ win->len = p->pack_size - win->offset;
pack_mapped += win->len;
while (packed_git_limit < pack_mapped
&& unuse_one_window(p))
@@ -859,7 +859,15 @@ int check_sha1_signature(const unsigned char *sha1, void *map, unsigned long siz
{
unsigned char real_sha1[20];
hash_sha1_file(map, size, type, real_sha1);
+#if 0
+ if (hashcmp(sha1, real_sha1) == 0)
+ return 0;
+ printf("%s: src:%s ", __func__, sha1_to_hex(sha1));
+ printf("real: %s type: %s\n", sha1_to_hex(real_sha1), type);
+ return -1;
+#else
return hashcmp(sha1, real_sha1) ? -1 : 0;
+#endif
}
void *map_sha1_file(const unsigned char *sha1, unsigned long *size)
@@ -1061,15 +1069,15 @@ void * unpack_sha1_file(void *map, unsigned long mapsize, char *type, unsigned l
return unpack_sha1_rest(&stream, hdr, *size);
}
-static unsigned long get_delta_base(struct packed_git *p,
+static off_t get_delta_base(struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
enum object_type kind,
- unsigned long delta_obj_offset,
- unsigned long *base_obj_offset)
+ off_t delta_obj_offset,
+ off_t *base_obj_offset)
{
unsigned char *base_info = use_pack(p, w_curs, offset, NULL);
- unsigned long base_offset;
+ off_t base_offset;
/* use_pack() assured us we have [base_info, base_info + 20)
* as a range that we can look at without walking off the
@@ -1083,7 +1091,7 @@ static unsigned long get_delta_base(struct packed_git *p,
base_offset = c & 127;
while (c & 128) {
base_offset += 1;
- if (!base_offset || base_offset & ~(~0UL >> 7))
+ if (!base_offset || base_offset & ~(~((off_t) 0) >> 7))
die("offset value overflow for delta base object");
c = base_info[used++];
base_offset = (base_offset << 7) + (c & 127);
@@ -1106,18 +1114,18 @@ static unsigned long get_delta_base(struct packed_git *p,
}
/* forward declaration for a mutually recursive function */
-static int packed_object_info(struct packed_git *p, unsigned long offset,
+static int packed_object_info(struct packed_git *p, off_t offset,
char *type, unsigned long *sizep);
static int packed_delta_info(struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
enum object_type kind,
- unsigned long obj_offset,
+ off_t obj_offset,
char *type,
unsigned long *sizep)
{
- unsigned long base_offset;
+ off_t base_offset;
offset = get_delta_base(p, w_curs, offset, kind,
obj_offset, &base_offset);
@@ -1169,9 +1177,9 @@ static int packed_delta_info(struct packed_git *p,
return 0;
}
-static unsigned long unpack_object_header(struct packed_git *p,
+static off_t unpack_object_header(struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
enum object_type *type,
unsigned long *sizep)
{
@@ -1194,7 +1202,7 @@ static unsigned long unpack_object_header(struct packed_git *p,
}
void packed_object_info_detail(struct packed_git *p,
- unsigned long offset,
+ off_t offset,
char *type,
unsigned long *size,
unsigned long *store_size,
@@ -1202,7 +1210,8 @@ void packed_object_info_detail(struct packed_git *p,
unsigned char *base_sha1)
{
struct pack_window *w_curs = NULL;
- unsigned long obj_offset, val;
+ off_t obj_offset;
+ unsigned long val;
unsigned char *next_sha1;
enum object_type kind;
@@ -1243,7 +1252,7 @@ void packed_object_info_detail(struct packed_git *p,
}
}
-static int packed_object_info(struct packed_git *p, unsigned long offset,
+static int packed_object_info(struct packed_git *p, off_t offset,
char *type, unsigned long *sizep)
{
struct pack_window *w_curs = NULL;
@@ -1278,7 +1287,7 @@ static int packed_object_info(struct packed_git *p, unsigned long offset,
static void *unpack_compressed_entry(struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
unsigned long size)
{
int st;
@@ -1309,22 +1318,23 @@ static void *unpack_compressed_entry(struct packed_git *p,
static void *unpack_delta_entry(struct packed_git *p,
struct pack_window **w_curs,
- unsigned long offset,
+ off_t offset,
unsigned long delta_size,
enum object_type kind,
- unsigned long obj_offset,
+ off_t obj_offset,
char *type,
unsigned long *sizep)
{
void *delta_data, *result, *base;
- unsigned long result_size, base_size, base_offset;
+ unsigned long result_size, base_size;
+ off_t base_offset;
offset = get_delta_base(p, w_curs, offset, kind,
obj_offset, &base_offset);
base = unpack_entry(p, base_offset, type, &base_size);
if (!base)
- die("failed to read delta base object at %lu from %s",
- base_offset, p->pack_name);
+ die("failed to read delta base object at %llu from %s",
+ (unsigned long long) base_offset, p->pack_name);
delta_data = unpack_compressed_entry(p, w_curs, offset, delta_size);
result = patch_delta(base, base_size,
@@ -1342,7 +1352,8 @@ void *unpack_entry(struct packed_git *p, off_t offset,
char *type, unsigned long *sizep)
{
struct pack_window *w_curs = NULL;
- unsigned long size, obj_offset = offset;
+ unsigned long size;
+ off_t obj_offset = offset;
enum object_type kind;
void *retval;
@@ -1495,7 +1506,7 @@ static int matches_pack_name(struct packed_git *p, const char *ig)
static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e, const char **ignore_packed)
{
struct packed_git *p;
- unsigned long offset;
+ off_t offset;
prepare_packed_git();
--
1.5.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
end of thread, other threads:[~2007-02-28 19:47 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-26 22:40 [PATCH] Support 64-bit indexes for pack files Troy Telford
2007-02-26 23:55 ` Shawn O. Pearce
2007-02-27 0:24 ` Nicolas Pitre
2007-02-27 0:31 ` Shawn O. Pearce
2007-02-27 4:32 ` Nicolas Pitre
2007-02-27 4:55 ` Geert Bosch
2007-02-27 5:11 ` Nicolas Pitre
2007-02-27 16:04 ` Geert Bosch
2007-02-27 16:11 ` Shawn O. Pearce
2007-02-27 16:55 ` Geert Bosch
2007-02-27 17:36 ` Nicolas Pitre
2007-02-28 3:52 ` Shawn O. Pearce
2007-02-28 4:12 ` Nicolas Pitre
2007-02-27 17:03 ` Nicolas Pitre
2007-02-27 20:05 ` Johannes Schindelin
2007-02-27 20:25 ` Geert Bosch
2007-02-27 20:35 ` Johannes Schindelin
2007-02-27 1:16 ` Troy Telford
2007-02-27 4:56 ` Nicolas Pitre
2007-02-28 19:46 ` Troy Telford
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.