git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 00/38] pack version 4 basic functionalities
@ 2013-09-05  6:19 Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 01/38] pack v4: initial pack dictionary structure and code Nicolas Pitre
                   ` (39 more replies)
  0 siblings, 40 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

After the initial posting here:

  http://news.gmane.org/group/gmane.comp.version-control.git/thread=233061

This is a repost plus the basic read side working, at least to validate
the write side and the pack format itself.  And many many bug fixes.

This can also be fetched here:

  git://git.linaro.org/people/nico/git

I consider the actual pack format definition final as implemented
by this code.

TODO:

- index-pack support

- native tree walk support

- native commit graph walk support

- better heuristics when creating tree delta encoding

- integration with pack-objects

- transfer protocol backward compatibility

- thin pack completion

- figure out unexplained runtime performance issues

However, as I mentioned already, I've put more time on this project lately
than I actually had available.  I really wanted to bring this project far
enough to be able to kick it out the door for others to take over, and
there we are.

I'm always available for design discussions and code review.  But don't
expect much additional code from me at this point.

@junio: I'm hoping you can take this branch as is, and apply any further
patches on top.

The diffstat goes like this:

 Makefile        |    3 +
 cache.h         |   11 +
 hex.c           |   11 +
 pack-check.c    |    4 +-
 pack-revindex.c |    7 +-
 pack-write.c    |    6 +-
 packv4-create.c | 1105 +++++++++++++++++++++++++++++++++++++++++++++++++
 packv4-parse.c  |  408 ++++++++++++++++++
 packv4-parse.h  |    9 +
 sha1_file.c     |  110 ++++-
 10 files changed, 1648 insertions(+), 26 deletions(-)

Enjoy !

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH 01/38] pack v4: initial pack dictionary structure and code
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 02/38] export packed_object_info() Nicolas Pitre
                   ` (38 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 packv4-create.c

diff --git a/packv4-create.c b/packv4-create.c
new file mode 100644
index 0000000..2de6d41
--- /dev/null
+++ b/packv4-create.c
@@ -0,0 +1,137 @@
+/*
+ * packv4-create.c: management of dictionary tables used in pack v4
+ *
+ * (C) Nicolas Pitre <nico@fluxnic.net>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+
+struct data_entry {
+	unsigned offset;
+	unsigned hits;
+};
+
+struct dict_table {
+	char *data;
+	unsigned ptr;
+	unsigned size;
+	struct data_entry *entry;
+	unsigned nb_entries;
+	unsigned max_entries;
+	unsigned *hash;
+	unsigned hash_size;
+};
+
+struct dict_table *create_dict_table(void)
+{
+	return xcalloc(sizeof(struct dict_table), 1);
+}
+
+void destroy_dict_table(struct dict_table *t)
+{
+	free(t->data);
+	free(t->entry);
+	free(t->hash);
+	free(t);
+}
+
+static int locate_entry(struct dict_table *t, const char *str)
+{
+	int i = 0;
+	const unsigned char *s = (const unsigned char *) str;
+
+	while (*s)
+		i = i * 111 + *s++;
+	i = (unsigned)i % t->hash_size;
+
+	while (t->hash[i]) {
+		unsigned n = t->hash[i] - 1;
+		if (!strcmp(str, t->data + t->entry[n].offset))
+			return n;
+		if (++i >= t->hash_size)
+			i = 0;
+	}
+	return -1 - i;
+}
+
+static void rehash_entries(struct dict_table *t)
+{
+	unsigned n;
+
+	t->hash_size *= 2;
+	if (t->hash_size < 1024)
+		t->hash_size = 1024;
+	t->hash = xrealloc(t->hash, t->hash_size * sizeof(*t->hash));
+	memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
+
+	for (n = 0; n < t->nb_entries; n++) {
+		int i = locate_entry(t, t->data + t->entry[n].offset);
+		if (i < 0)
+			t->hash[-1 - i] = n + 1;
+	}
+}
+
+int dict_add_entry(struct dict_table *t, const char *str)
+{
+	int i, len = strlen(str) + 1;
+
+	if (t->ptr + len >= t->size) {
+		t->size = (t->size + len + 1024) * 3 / 2;
+		t->data = xrealloc(t->data, t->size);
+	}
+	memcpy(t->data + t->ptr, str, len);
+
+	i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
+	if (i >= 0) {
+		t->entry[i].hits++;
+		return i;
+	}
+
+	if (t->nb_entries >= t->max_entries) {
+		t->max_entries = (t->max_entries + 1024) * 3 / 2;
+		t->entry = xrealloc(t->entry, t->max_entries * sizeof(*t->entry));
+	}
+	t->entry[t->nb_entries].offset = t->ptr;
+	t->entry[t->nb_entries].hits = 1;
+	t->ptr += len + 1;
+	t->nb_entries++;
+
+	if (t->hash_size * 3 <= t->nb_entries * 4)
+		rehash_entries(t);
+	else
+		t->hash[-1 - i] = t->nb_entries;
+
+	return t->nb_entries - 1;
+}
+
+static int cmp_dict_entries(const void *a_, const void *b_)
+{
+	const struct data_entry *a = a_;
+	const struct data_entry *b = b_;
+	int diff = b->hits - a->hits;
+	if (!diff)
+		diff = a->offset - b->offset;
+	return diff;
+}
+
+static void sort_dict_entries_by_hits(struct dict_table *t)
+{
+	qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
+	t->hash_size = (t->nb_entries * 4 / 3) / 2;
+	rehash_entries(t);
+}
+
+void dict_dump(struct dict_table *t)
+{
+	int i;
+
+	sort_dict_entries_by_hits(t);
+	for (i = 0; i < t->nb_entries; i++)
+		printf("%d\t%s\n",
+			t->entry[i].hits,
+			t->data + t->entry[i].offset);
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 02/38] export packed_object_info()
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 01/38] pack v4: initial pack dictionary structure and code Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 03/38] pack v4: scan tree objects Nicolas Pitre
                   ` (37 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h     | 1 +
 sha1_file.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index 85b544f..b6634c4 100644
--- a/cache.h
+++ b/cache.h
@@ -1160,6 +1160,7 @@ struct object_info {
 	} u;
 };
 extern int sha1_object_info_extended(const unsigned char *, struct object_info *);
+extern int packed_object_info(struct packed_git *, off_t, struct object_info *);
 
 /* Dumb servers support */
 extern int update_server_info(int);
diff --git a/sha1_file.c b/sha1_file.c
index 8e27db1..c2020d0 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1782,8 +1782,8 @@ unwind:
 	goto out;
 }
 
-static int packed_object_info(struct packed_git *p, off_t obj_offset,
-			      struct object_info *oi)
+int packed_object_info(struct packed_git *p, off_t obj_offset,
+		       struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
 	unsigned long size;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 03/38] pack v4: scan tree objects
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 01/38] pack v4: initial pack dictionary structure and code Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 02/38] export packed_object_info() Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 04/38] pack v4: add tree entry mode support to dictionary entries Nicolas Pitre
                   ` (36 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Let's read a pack to feed our dictionary with all the path strings
contained in all the tree objects.

Dump the resulting dictionary sorted by frequency to stdout.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 Makefile        |   1 +
 packv4-create.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)

diff --git a/Makefile b/Makefile
index 3588ca1..4716113 100644
--- a/Makefile
+++ b/Makefile
@@ -550,6 +550,7 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
+PROGRAM_OBJS += packv4-create.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
diff --git a/packv4-create.c b/packv4-create.c
index 2de6d41..00762a5 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -9,6 +9,8 @@
  */
 
 #include "cache.h"
+#include "object.h"
+#include "tree-walk.h"
 
 struct data_entry {
 	unsigned offset;
@@ -125,6 +127,22 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
 	rehash_entries(t);
 }
 
+static struct dict_table *tree_path_table;
+
+static int add_tree_dict_entries(void *buf, unsigned long size)
+{
+	struct tree_desc desc;
+	struct name_entry name_entry;
+
+	if (!tree_path_table)
+		tree_path_table = create_dict_table();
+
+	init_tree_desc(&desc, buf, size);
+	while (tree_entry(&desc, &name_entry))
+		dict_add_entry(tree_path_table, name_entry.path);
+	return 0;
+}
+
 void dict_dump(struct dict_table *t)
 {
 	int i;
@@ -135,3 +153,122 @@ void dict_dump(struct dict_table *t)
 			t->entry[i].hits,
 			t->data + t->entry[i].offset);
 }
+
+struct idx_entry
+{
+	off_t                offset;
+	const unsigned char *sha1;
+};
+
+static int sort_by_offset(const void *e1, const void *e2)
+{
+	const struct idx_entry *entry1 = e1;
+	const struct idx_entry *entry2 = e2;
+	if (entry1->offset < entry2->offset)
+		return -1;
+	if (entry1->offset > entry2->offset)
+		return 1;
+	return 0;
+}
+static int create_pack_dictionaries(struct packed_git *p)
+{
+	uint32_t nr_objects, i;
+	struct idx_entry *objects;
+
+	nr_objects = p->num_objects;
+	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+	objects[nr_objects].offset = p->index_size - 40;
+	for (i = 0; i < nr_objects; i++) {
+		objects[i].sha1 = nth_packed_object_sha1(p, i);
+		objects[i].offset = nth_packed_object_offset(p, i);
+	}
+	qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
+
+	for (i = 0; i < nr_objects; i++) {
+		void *data;
+		enum object_type type;
+		unsigned long size;
+		struct object_info oi = {};
+
+		oi.typep = &type;
+		oi.sizep = &size;
+		if (packed_object_info(p, objects[i].offset, &oi) < 0)
+			die("cannot get type of %s from %s",
+			    sha1_to_hex(objects[i].sha1), p->pack_name);
+
+		switch (type) {
+		case OBJ_TREE:
+			break;
+		default:
+			continue;
+		}
+		data = unpack_entry(p, objects[i].offset, &type, &size);
+		if (!data)
+			die("cannot unpack %s from %s",
+			    sha1_to_hex(objects[i].sha1), p->pack_name);
+		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
+			die("packed %s from %s is corrupt",
+			    sha1_to_hex(objects[i].sha1), p->pack_name);
+		if (add_tree_dict_entries(data, size) < 0)
+			die("can't process %s object %s",
+				typename(type), sha1_to_hex(objects[i].sha1));
+		free(data);
+	}
+	free(objects);
+
+	return 0;
+}
+
+static int process_one_pack(const char *path)
+{
+	char arg[PATH_MAX];
+	int len;
+	struct packed_git *p;
+
+	len = strlcpy(arg, path, PATH_MAX);
+	if (len >= PATH_MAX)
+		return error("name too long: %s", path);
+
+	/*
+	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
+	 * normalize these forms to "foo.idx" for add_packed_git().
+	 */
+	if (has_extension(arg, ".pack")) {
+		strcpy(arg + len - 5, ".idx");
+		len--;
+	} else if (!has_extension(arg, ".idx")) {
+		if (len + 4 >= PATH_MAX)
+			return error("name too long: %s.idx", arg);
+		strcpy(arg + len, ".idx");
+		len += 4;
+	}
+
+	/*
+	 * add_packed_git() uses our buffer (containing "foo.idx") to
+	 * build the pack filename ("foo.pack").  Make sure it fits.
+	 */
+	if (len + 1 >= PATH_MAX) {
+		arg[len - 4] = '\0';
+		return error("name too long: %s.pack", arg);
+	}
+
+	p = add_packed_git(arg, len, 1);
+	if (!p)
+		return error("packfile %s not found.", arg);
+
+	install_packed_git(p);
+	if (open_pack_index(p))
+		return error("packfile %s index not opened", p->pack_name);
+	return create_pack_dictionaries(p);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s <packfile>\n", argv[0]);
+		exit(1);
+	}
+	process_one_pack(argv[1]);
+	dict_dump(tree_path_table);
+	return 0;
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 04/38] pack v4: add tree entry mode support to dictionary entries
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (2 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 03/38] pack v4: scan tree objects Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 05/38] pack v4: add commit object parsing Nicolas Pitre
                   ` (35 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Augment dict entries with a 16-bit prefix in order to store the file
mode value of tree entries.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 56 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 00762a5..eccd9fc 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -14,11 +14,12 @@
 
 struct data_entry {
 	unsigned offset;
+	unsigned size;
 	unsigned hits;
 };
 
 struct dict_table {
-	char *data;
+	unsigned char *data;
 	unsigned ptr;
 	unsigned size;
 	struct data_entry *entry;
@@ -41,18 +42,19 @@ void destroy_dict_table(struct dict_table *t)
 	free(t);
 }
 
-static int locate_entry(struct dict_table *t, const char *str)
+static int locate_entry(struct dict_table *t, const void *data, int size)
 {
-	int i = 0;
-	const unsigned char *s = (const unsigned char *) str;
+	int i = 0, len = size;
+	const unsigned char *p = data;
 
-	while (*s)
-		i = i * 111 + *s++;
+	while (len--)
+		i = i * 111 + *p++;
 	i = (unsigned)i % t->hash_size;
 
 	while (t->hash[i]) {
 		unsigned n = t->hash[i] - 1;
-		if (!strcmp(str, t->data + t->entry[n].offset))
+		if (t->entry[n].size == size &&
+		    memcmp(t->data + t->entry[n].offset, data, size) == 0)
 			return n;
 		if (++i >= t->hash_size)
 			i = 0;
@@ -71,23 +73,28 @@ static void rehash_entries(struct dict_table *t)
 	memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
 
 	for (n = 0; n < t->nb_entries; n++) {
-		int i = locate_entry(t, t->data + t->entry[n].offset);
+		int i = locate_entry(t, t->data + t->entry[n].offset,
+					t->entry[n].size);
 		if (i < 0)
 			t->hash[-1 - i] = n + 1;
 	}
 }
 
-int dict_add_entry(struct dict_table *t, const char *str)
+int dict_add_entry(struct dict_table *t, int val, const char *str)
 {
-	int i, len = strlen(str) + 1;
+	int i, val_len = 2, str_len = strlen(str) + 1;
 
-	if (t->ptr + len >= t->size) {
-		t->size = (t->size + len + 1024) * 3 / 2;
+	if (t->ptr + val_len + str_len > t->size) {
+		t->size = (t->size + val_len + str_len + 1024) * 3 / 2;
 		t->data = xrealloc(t->data, t->size);
 	}
-	memcpy(t->data + t->ptr, str, len);
 
-	i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
+	t->data[t->ptr] = val >> 8;
+	t->data[t->ptr + 1] = val;
+	memcpy(t->data + t->ptr + val_len, str, str_len);
+
+	i = (t->nb_entries) ?
+		locate_entry(t, t->data + t->ptr, val_len + str_len) : -1;
 	if (i >= 0) {
 		t->entry[i].hits++;
 		return i;
@@ -98,8 +105,9 @@ int dict_add_entry(struct dict_table *t, const char *str)
 		t->entry = xrealloc(t->entry, t->max_entries * sizeof(*t->entry));
 	}
 	t->entry[t->nb_entries].offset = t->ptr;
+	t->entry[t->nb_entries].size = val_len + str_len;
 	t->entry[t->nb_entries].hits = 1;
-	t->ptr += len + 1;
+	t->ptr += val_len + str_len;
 	t->nb_entries++;
 
 	if (t->hash_size * 3 <= t->nb_entries * 4)
@@ -139,7 +147,8 @@ static int add_tree_dict_entries(void *buf, unsigned long size)
 
 	init_tree_desc(&desc, buf, size);
 	while (tree_entry(&desc, &name_entry))
-		dict_add_entry(tree_path_table, name_entry.path);
+		dict_add_entry(tree_path_table, name_entry.mode,
+			       name_entry.path);
 	return 0;
 }
 
@@ -148,10 +157,16 @@ void dict_dump(struct dict_table *t)
 	int i;
 
 	sort_dict_entries_by_hits(t);
-	for (i = 0; i < t->nb_entries; i++)
-		printf("%d\t%s\n",
-			t->entry[i].hits,
-			t->data + t->entry[i].offset);
+	for (i = 0; i < t->nb_entries; i++) {
+		int16_t val;
+		uint16_t uval;
+		val = t->data[t->entry[i].offset] << 8;
+		val |= t->data[t->entry[i].offset + 1];
+		uval = val;
+		printf("%d\t%d\t%o\t%s\n",
+			t->entry[i].hits, val, uval,
+			t->data + t->entry[i].offset + 2);
+	}
 }
 
 struct idx_entry
@@ -170,6 +185,7 @@ static int sort_by_offset(const void *e1, const void *e2)
 		return 1;
 	return 0;
 }
+
 static int create_pack_dictionaries(struct packed_git *p)
 {
 	uint32_t nr_objects, i;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 05/38] pack v4: add commit object parsing
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (3 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 04/38] pack v4: add tree entry mode support to dictionary entries Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05 10:30   ` SZEDER Gábor
  2013-09-05  6:19 ` [PATCH 06/38] pack v4: split the object list and dictionary creation Nicolas Pitre
                   ` (34 subsequent siblings)
  39 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Let's create another dictionary table to hold the author and committer
entries.  We use the same table format used for tree entries where the
16 bit data prefix is conveniently used to store the timezone value.

In order to copy straight from a commit object buffer, dict_add_entry()
is modified to get the string length as the provided string pointer is
not always null terminated.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 9 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index eccd9fc..5c08871 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -1,5 +1,5 @@
 /*
- * packv4-create.c: management of dictionary tables used in pack v4
+ * packv4-create.c: creation of dictionary tables and objects used in pack v4
  *
  * (C) Nicolas Pitre <nico@fluxnic.net>
  *
@@ -80,9 +80,9 @@ static void rehash_entries(struct dict_table *t)
 	}
 }
 
-int dict_add_entry(struct dict_table *t, int val, const char *str)
+int dict_add_entry(struct dict_table *t, int val, const char *str, int str_len)
 {
-	int i, val_len = 2, str_len = strlen(str) + 1;
+	int i, val_len = 2;
 
 	if (t->ptr + val_len + str_len > t->size) {
 		t->size = (t->size + val_len + str_len + 1024) * 3 / 2;
@@ -92,6 +92,7 @@ int dict_add_entry(struct dict_table *t, int val, const char *str)
 	t->data[t->ptr] = val >> 8;
 	t->data[t->ptr + 1] = val;
 	memcpy(t->data + t->ptr + val_len, str, str_len);
+	t->data[t->ptr + val_len + str_len] = 0;
 
 	i = (t->nb_entries) ?
 		locate_entry(t, t->data + t->ptr, val_len + str_len) : -1;
@@ -107,7 +108,7 @@ int dict_add_entry(struct dict_table *t, int val, const char *str)
 	t->entry[t->nb_entries].offset = t->ptr;
 	t->entry[t->nb_entries].size = val_len + str_len;
 	t->entry[t->nb_entries].hits = 1;
-	t->ptr += val_len + str_len;
+	t->ptr += val_len + str_len + 1;
 	t->nb_entries++;
 
 	if (t->hash_size * 3 <= t->nb_entries * 4)
@@ -135,8 +136,73 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
 	rehash_entries(t);
 }
 
+static struct dict_table *commit_name_table;
 static struct dict_table *tree_path_table;
 
+/*
+ * Parse the author/committer line from a canonical commit object.
+ * The 'from' argument points right after the "author " or "committer "
+ * string.  The time zone is parsed and stored in *tz_val.  The returned
+ * pointer is right after the end of the email address which is also just
+ * before the time value, or NULL if a parsing error is encountered.
+ */
+static char *get_nameend_and_tz(char *from, int *tz_val)
+{
+	char *end, *tz;
+
+	tz = strchr(from, '\n');
+	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
+	if (!tz || tz - from < 13)
+		return NULL;
+	tz -= 4;
+	end = tz - 4;
+	while (end - from > 5 && *end != ' ')
+		end--;
+	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
+		return NULL;
+	*tz_val = (tz[0] - '0') * 1000 +
+		  (tz[1] - '0') * 100 +
+		  (tz[2] - '0') * 10 +
+		  (tz[3] - '0');
+	switch (tz[-1]) {
+	default:	return NULL;
+	case '+':	break;
+	case '-':	*tz_val = -*tz_val;
+	}
+	return end;
+}
+
+static int add_commit_dict_entries(void *buf, unsigned long size)
+{
+	char *name, *end = NULL;
+	int tz_val;
+
+	if (!commit_name_table)
+		commit_name_table = create_dict_table();
+
+	/* parse and add author info */
+	name = strstr(buf, "\nauthor ");
+	if (name) {
+		name += 8;
+		end = get_nameend_and_tz(name, &tz_val);
+	}
+	if (!name || !end)
+		return -1;
+	dict_add_entry(commit_name_table, tz_val, name, end - name);
+
+	/* parse and add committer info */
+	name = strstr(end, "\ncommitter ");
+	if (name) {
+	       name += 11;
+	       end = get_nameend_and_tz(name, &tz_val);
+	}
+	if (!name || !end)
+		return -1;
+	dict_add_entry(commit_name_table, tz_val, name, end - name);
+
+	return 0;
+}
+
 static int add_tree_dict_entries(void *buf, unsigned long size)
 {
 	struct tree_desc desc;
@@ -146,13 +212,16 @@ static int add_tree_dict_entries(void *buf, unsigned long size)
 		tree_path_table = create_dict_table();
 
 	init_tree_desc(&desc, buf, size);
-	while (tree_entry(&desc, &name_entry))
+	while (tree_entry(&desc, &name_entry)) {
+		int pathlen = tree_entry_len(&name_entry);
 		dict_add_entry(tree_path_table, name_entry.mode,
-			       name_entry.path);
+				name_entry.path, pathlen);
+	}
+
 	return 0;
 }
 
-void dict_dump(struct dict_table *t)
+void dump_dict_table(struct dict_table *t)
 {
 	int i;
 
@@ -169,6 +238,12 @@ void dict_dump(struct dict_table *t)
 	}
 }
 
+static void dict_dump(void)
+{
+	dump_dict_table(commit_name_table);
+	dump_dict_table(tree_path_table);
+}
+
 struct idx_entry
 {
 	off_t                offset;
@@ -205,6 +280,7 @@ static int create_pack_dictionaries(struct packed_git *p)
 		enum object_type type;
 		unsigned long size;
 		struct object_info oi = {};
+		int (*add_dict_entries)(void *, unsigned long);
 
 		oi.typep = &type;
 		oi.sizep = &size;
@@ -213,7 +289,11 @@ static int create_pack_dictionaries(struct packed_git *p)
 			    sha1_to_hex(objects[i].sha1), p->pack_name);
 
 		switch (type) {
+		case OBJ_COMMIT:
+			add_dict_entries = add_commit_dict_entries;
+			break;
 		case OBJ_TREE:
+			add_dict_entries = add_tree_dict_entries;
 			break;
 		default:
 			continue;
@@ -225,7 +305,7 @@ static int create_pack_dictionaries(struct packed_git *p)
 		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
 			die("packed %s from %s is corrupt",
 			    sha1_to_hex(objects[i].sha1), p->pack_name);
-		if (add_tree_dict_entries(data, size) < 0)
+		if (add_dict_entries(data, size) < 0)
 			die("can't process %s object %s",
 				typename(type), sha1_to_hex(objects[i].sha1));
 		free(data);
@@ -285,6 +365,6 @@ int main(int argc, char *argv[])
 		exit(1);
 	}
 	process_one_pack(argv[1]);
-	dict_dump(tree_path_table);
+	dict_dump();
 	return 0;
 }
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 06/38] pack v4: split the object list and dictionary creation
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (4 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 05/38] pack v4: add commit object parsing Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 07/38] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry Nicolas Pitre
                   ` (33 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 58 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 5c08871..20d97a4 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -261,7 +261,7 @@ static int sort_by_offset(const void *e1, const void *e2)
 	return 0;
 }
 
-static int create_pack_dictionaries(struct packed_git *p)
+static struct idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	uint32_t nr_objects, i;
 	struct idx_entry *objects;
@@ -275,7 +275,15 @@ static int create_pack_dictionaries(struct packed_git *p)
 	}
 	qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
 
-	for (i = 0; i < nr_objects; i++) {
+	return objects;
+}
+
+static int create_pack_dictionaries(struct packed_git *p,
+				    struct idx_entry *objects)
+{
+	unsigned int i;
+
+	for (i = 0; i < p->num_objects; i++) {
 		void *data;
 		enum object_type type;
 		unsigned long size;
@@ -310,20 +318,21 @@ static int create_pack_dictionaries(struct packed_git *p)
 				typename(type), sha1_to_hex(objects[i].sha1));
 		free(data);
 	}
-	free(objects);
 
 	return 0;
 }
 
-static int process_one_pack(const char *path)
+static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
 	int len;
 	struct packed_git *p;
 
 	len = strlcpy(arg, path, PATH_MAX);
-	if (len >= PATH_MAX)
-		return error("name too long: %s", path);
+	if (len >= PATH_MAX) {
+		error("name too long: %s", path);
+		return NULL;
+	}
 
 	/*
 	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
@@ -333,8 +342,10 @@ static int process_one_pack(const char *path)
 		strcpy(arg + len - 5, ".idx");
 		len--;
 	} else if (!has_extension(arg, ".idx")) {
-		if (len + 4 >= PATH_MAX)
-			return error("name too long: %s.idx", arg);
+		if (len + 4 >= PATH_MAX) {
+			error("name too long: %s.idx", arg);
+			return NULL;
+		}
 		strcpy(arg + len, ".idx");
 		len += 4;
 	}
@@ -345,17 +356,36 @@ static int process_one_pack(const char *path)
 	 */
 	if (len + 1 >= PATH_MAX) {
 		arg[len - 4] = '\0';
-		return error("name too long: %s.pack", arg);
+		error("name too long: %s.pack", arg);
+		return NULL;
 	}
 
 	p = add_packed_git(arg, len, 1);
-	if (!p)
-		return error("packfile %s not found.", arg);
+	if (!p) {
+		error("packfile %s not found.", arg);
+		return NULL;
+	}
 
 	install_packed_git(p);
-	if (open_pack_index(p))
-		return error("packfile %s index not opened", p->pack_name);
-	return create_pack_dictionaries(p);
+	if (open_pack_index(p)) {
+		error("packfile %s index not opened", p->pack_name);
+		return NULL;
+	}
+
+	return p;
+}
+
+static void process_one_pack(char *src_pack)
+{
+	struct packed_git *p;
+	struct idx_entry *objs;
+
+	p = open_pack(src_pack);
+	if (!p)
+		die("unable to open source pack");
+
+	objs = get_packed_object_list(p);
+	create_pack_dictionaries(p, objs);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 07/38] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (5 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 06/38] pack v4: split the object list and dictionary creation Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 08/38] pack v4: basic SHA1 reference encoding Nicolas Pitre
                   ` (32 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Let's create a struct pack_idx_entry list with sorted sha1 which will
be useful later.  The offset sorted list is now a separate indirect
list.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 72 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 30 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 20d97a4..012129b 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -11,6 +11,7 @@
 #include "cache.h"
 #include "object.h"
 #include "tree-walk.h"
+#include "pack.h"
 
 struct data_entry {
 	unsigned offset;
@@ -244,46 +245,53 @@ static void dict_dump(void)
 	dump_dict_table(tree_path_table);
 }
 
-struct idx_entry
+static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
-	off_t                offset;
-	const unsigned char *sha1;
-};
+	unsigned i, nr_objects = p->num_objects;
+	struct pack_idx_entry *objects;
+
+	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+	objects[nr_objects].offset = p->pack_size - 20;
+	for (i = 0; i < nr_objects; i++) {
+		hashcpy(objects[i].sha1, nth_packed_object_sha1(p, i));
+		objects[i].offset = nth_packed_object_offset(p, i);
+	}
+
+	return objects;
+}
 
 static int sort_by_offset(const void *e1, const void *e2)
 {
-	const struct idx_entry *entry1 = e1;
-	const struct idx_entry *entry2 = e2;
-	if (entry1->offset < entry2->offset)
+	const struct pack_idx_entry * const *entry1 = e1;
+	const struct pack_idx_entry * const *entry2 = e2;
+	if ((*entry1)->offset < (*entry2)->offset)
 		return -1;
-	if (entry1->offset > entry2->offset)
+	if ((*entry1)->offset > (*entry2)->offset)
 		return 1;
 	return 0;
 }
 
-static struct idx_entry *get_packed_object_list(struct packed_git *p)
+static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
+						    unsigned nr_objects)
 {
-	uint32_t nr_objects, i;
-	struct idx_entry *objects;
+	unsigned i;
+	struct pack_idx_entry **sorted;
 
-	nr_objects = p->num_objects;
-	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
-	objects[nr_objects].offset = p->index_size - 40;
-	for (i = 0; i < nr_objects; i++) {
-		objects[i].sha1 = nth_packed_object_sha1(p, i);
-		objects[i].offset = nth_packed_object_offset(p, i);
-	}
-	qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
+	sorted = xmalloc((nr_objects + 1) * sizeof(*sorted));
+	for (i = 0; i < nr_objects + 1; i++)
+		sorted[i] = &list[i];
+	qsort(sorted, nr_objects + 1, sizeof(*sorted), sort_by_offset);
 
-	return objects;
+	return sorted;
 }
 
 static int create_pack_dictionaries(struct packed_git *p,
-				    struct idx_entry *objects)
+				    struct pack_idx_entry **obj_list)
 {
 	unsigned int i;
 
 	for (i = 0; i < p->num_objects; i++) {
+		struct pack_idx_entry *obj = obj_list[i];
 		void *data;
 		enum object_type type;
 		unsigned long size;
@@ -292,9 +300,9 @@ static int create_pack_dictionaries(struct packed_git *p,
 
 		oi.typep = &type;
 		oi.sizep = &size;
-		if (packed_object_info(p, objects[i].offset, &oi) < 0)
+		if (packed_object_info(p, obj->offset, &oi) < 0)
 			die("cannot get type of %s from %s",
-			    sha1_to_hex(objects[i].sha1), p->pack_name);
+			    sha1_to_hex(obj->sha1), p->pack_name);
 
 		switch (type) {
 		case OBJ_COMMIT:
@@ -306,16 +314,16 @@ static int create_pack_dictionaries(struct packed_git *p,
 		default:
 			continue;
 		}
-		data = unpack_entry(p, objects[i].offset, &type, &size);
+		data = unpack_entry(p, obj->offset, &type, &size);
 		if (!data)
 			die("cannot unpack %s from %s",
-			    sha1_to_hex(objects[i].sha1), p->pack_name);
-		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
+			    sha1_to_hex(obj->sha1), p->pack_name);
+		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
 			die("packed %s from %s is corrupt",
-			    sha1_to_hex(objects[i].sha1), p->pack_name);
+			    sha1_to_hex(obj->sha1), p->pack_name);
 		if (add_dict_entries(data, size) < 0)
 			die("can't process %s object %s",
-				typename(type), sha1_to_hex(objects[i].sha1));
+				typename(type), sha1_to_hex(obj->sha1));
 		free(data);
 	}
 
@@ -378,14 +386,18 @@ static struct packed_git *open_pack(const char *path)
 static void process_one_pack(char *src_pack)
 {
 	struct packed_git *p;
-	struct idx_entry *objs;
+	struct pack_idx_entry *objs, **p_objs;
+	unsigned nr_objects;
 
 	p = open_pack(src_pack);
 	if (!p)
 		die("unable to open source pack");
 
+	nr_objects = p->num_objects;
 	objs = get_packed_object_list(p);
-	create_pack_dictionaries(p, objs);
+	p_objs = sort_objs_by_offset(objs, nr_objects);
+
+	create_pack_dictionaries(p, p_objs);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 08/38] pack v4: basic SHA1 reference encoding
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (6 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 07/38] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 09/38] introduce get_sha1_lowhex() Nicolas Pitre
                   ` (31 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

The SHA1 reference is either an index into a SHA1 table using the variable
length number encoding, or the literal 20 bytes SHA1 prefixed with a 0.

The index 0 discriminates between an actual index value and the literal
SHA1.  Therefore when the index is used its value must be increased by 1.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 012129b..12527c0 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -12,6 +12,7 @@
 #include "object.h"
 #include "tree-walk.h"
 #include "pack.h"
+#include "varint.h"
 
 struct data_entry {
 	unsigned offset;
@@ -245,6 +246,34 @@ static void dict_dump(void)
 	dump_dict_table(tree_path_table);
 }
 
+/*
+ * Encode an object SHA1 reference with either an object index into the
+ * pack SHA1 table incremented by 1, or the literal SHA1 value prefixed
+ * with a zero byte if the needed SHA1 is not available in the table.
+ */
+static struct pack_idx_entry *all_objs;
+static unsigned all_objs_nr;
+static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
+{
+	unsigned lo = 0, hi = all_objs_nr;
+
+	while (lo < hi) {
+		unsigned mi = lo + (hi - lo) / 2;
+		int cmp = hashcmp(all_objs[mi].sha1, sha1);
+
+		if (cmp == 0)
+			return encode_varint(mi + 1, buf);
+		if (cmp > 0)
+			hi = mi;
+		else
+			lo = mi+1;
+	}
+
+	*buf++ = 0;
+	hashcpy(buf, sha1);
+	return 1 + 20;
+}
+
 static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	unsigned i, nr_objects = p->num_objects;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 09/38] introduce get_sha1_lowhex()
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (7 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 08/38] pack v4: basic SHA1 reference encoding Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 10/38] pack v4: commit object encoding Nicolas Pitre
                   ` (30 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

This is like get_sha1_hex() but stricter in accepting lowercase letters
only.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h |  3 +++
 hex.c   | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/cache.h b/cache.h
index b6634c4..4231dfa 100644
--- a/cache.h
+++ b/cache.h
@@ -850,8 +850,11 @@ extern int for_each_abbrev(const char *prefix, each_abbrev_fn, void *);
  * Return 0 on success.  Reading stops if a NUL is encountered in the
  * input, so it is safe to pass this function an arbitrary
  * null-terminated string.
+ *
+ * The "low" version accepts numbers and lowercase letters only.
  */
 extern int get_sha1_hex(const char *hex, unsigned char *sha1);
+extern int get_sha1_lowhex(const char *hex, unsigned char *sha1);
 
 extern char *sha1_to_hex(const unsigned char *sha1);	/* static buffer result! */
 extern int read_ref_full(const char *refname, unsigned char *sha1,
diff --git a/hex.c b/hex.c
index 9ebc050..1d7eae1 100644
--- a/hex.c
+++ b/hex.c
@@ -56,6 +56,17 @@ int get_sha1_hex(const char *hex, unsigned char *sha1)
 	return 0;
 }
 
+int get_sha1_lowhex(const char *hex, unsigned char *sha1)
+{
+	int i;
+
+	/* check all 40 digits: uppercase letters (and '\0') have bit 5 clear */
+	for (i = 0; i < 40; i++)
+		if (!(hex[i] & 0x20))
+			return -1;
+	return get_sha1_hex(hex, sha1);
+}
+
 char *sha1_to_hex(const unsigned char *sha1)
 {
 	static int bufno;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 10/38] pack v4: commit object encoding
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (8 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 09/38] introduce get_sha1_lowhex() Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-06  6:57   ` Junio C Hamano
  2013-09-05  6:19 ` [PATCH 11/38] pack v4: tree " Nicolas Pitre
                   ` (29 subsequent siblings)
  39 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

This goes as follows:

- Tree reference: either variable length encoding of the index
  into the SHA1 table or the literal SHA1 prefixed by 0 (see
  encode_sha1ref()).

- Parent count: variable length encoding of the number of parents.
  This is normally going to occupy a single byte but doesn't have to.

- List of parent references: a list of encode_sha1ref() encoded
  references, or nothing if the parent count was zero.

- Author reference: variable length encoding of an index into the author
  identifier dictionary table which also covers the time zone.  To make
  the overall encoding efficient, the author table is sorted by usage
  frequency so the most used names are first and require the shortest
  index encoding.

- Author time stamp: variable length encoded.  Year 2038 ready!

- Committer reference: same as author reference.

- Committer time stamp: same as author time stamp.

The remainder of the canonical commit object content is then zlib
compressed and appended to the above.

Rationale: The most important commit object data is densely encoded while
requiring no zlib inflate processing on access, and all SHA1 references
are most likely to be direct indices into the pack index file requiring
no SHA1 search into the pack index file.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 12527c0..d4a79f4 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -14,6 +14,9 @@
 #include "pack.h"
 #include "varint.h"
 
+
+static int pack_compression_level = Z_DEFAULT_COMPRESSION;
+
 struct data_entry {
 	unsigned offset;
 	unsigned size;
@@ -274,6 +277,122 @@ static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
 	return 1 + 20;
 }
 
+/*
+ * This converts a canonical commit object buffer into its
+ * tightly packed representation using the already populated
+ * and sorted commit_name_table dictionary.  The parsing is
+ * strict so as to ensure the canonical version may always be
+ * regenerated and produce the same hash.
+ */
+void *pv4_encode_commit(void *buffer, unsigned long *sizep)
+{
+	unsigned long size = *sizep;
+	char *in, *tail, *end;
+	unsigned char *out;
+	unsigned char sha1[20];
+	int nb_parents, index, tz_val;
+	unsigned long time;
+	z_stream stream;
+	int status;
+
+	/*
+	 * It is guaranteed that the output is always going to be smaller
+	 * than the input.  We could even do this conversion in place.
+	 */
+	in = buffer;
+	tail = in + size;
+	buffer = xmalloc(size);
+	out = buffer;
+
+	/* parse the "tree" line */
+	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
+		goto bad_data;
+	if (get_sha1_lowhex(in + 5, sha1) < 0)
+		goto bad_data;
+	in += 46;
+	out += encode_sha1ref(sha1, out);
+
+	/* count how many "parent" lines */
+	nb_parents = 0;
+	while (in + 48 < tail && !memcmp(in, "parent ", 7) && in[47] == '\n') {
+		nb_parents++;
+		in += 48;
+	}
+	out += encode_varint(nb_parents, out);
+
+	/* rewind and parse the "parent" lines */
+	in -= 48 * nb_parents;
+	while (nb_parents--) {
+		if (get_sha1_lowhex(in + 7, sha1))
+			goto bad_data;
+		out += encode_sha1ref(sha1, out);
+		in += 48;
+	}
+
+	/* parse the "author" line */
+	/* it must be at least "author x <x> 0 +0000\n" i.e. 21 chars */
+	if (in + 21 >= tail || memcmp(in, "author ", 7))
+		goto bad_data;
+	in += 7;
+	end = get_nameend_and_tz(in, &tz_val);
+	if (!end)
+		goto bad_data;
+	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
+	if (index < 0)
+		goto bad_dict;
+	out += encode_varint(index, out);
+	time = strtoul(end, &end, 10);
+	if (!end || end[0] != ' ' || end[6] != '\n')
+		goto bad_data;
+	out += encode_varint(time, out);
+	in = end + 7;
+
+	/* parse the "committer" line */
+	/* it must be at least "committer x <x> 0 +0000\n" i.e. 24 chars */
+	if (in + 24 >= tail || memcmp(in, "committer ", 10))
+		goto bad_data;
+	in += 10;
+	end = get_nameend_and_tz(in, &tz_val);
+	if (!end)
+		goto bad_data;
+	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
+	if (index < 0)
+		goto bad_dict;
+	out += encode_varint(index, out);
+	time = strtoul(end, &end, 10);
+	if (!end || end[0] != ' ' || end[6] != '\n')
+		goto bad_data;
+	out += encode_varint(time, out);
+	in = end + 7;
+
+	/* finally, deflate the remaining data */
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, pack_compression_level);
+	stream.next_in = (unsigned char *)in;
+	stream.avail_in = tail - in;
+	stream.next_out = (unsigned char *)out;
+	stream.avail_out = size - (out - (unsigned char *)buffer);
+	status = deflate(&stream, Z_FINISH);
+	end = (char *)stream.next_out;
+	deflateEnd(&stream);
+	if (status != Z_STREAM_END) {
+		error("deflate error status %d", status);
+		goto bad;
+	}
+
+	*sizep = end - (char *)buffer;
+	return buffer;
+
+bad_data:
+	error("bad commit data");
+	goto bad;
+bad_dict:
+	error("bad dict entry");
+bad:
+	free(buffer);
+	return NULL;
+}
+
 static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	unsigned i, nr_objects = p->num_objects;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 11/38] pack v4: tree object encoding
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (9 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 10/38] pack v4: commit object encoding Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 12/38] pack v4: dictionary table output Nicolas Pitre
                   ` (28 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

This goes as follows:

- Number of tree entries: variable length encoded.

Then for each tree entry:

- Path component reference: variable length encoded index into the path
  dictionary table which also covers the entry mode. To make the overall
  encoding efficient, the path table is already sorted by usage frequency
  so the most used path names are first and require the shortest index
  encoding.

- SHA1 reference: either variable length encoding of the index into the
  SHA1 table or the literal SHA1 prefixed by 0 (see encode_sha1ref()).

Rationale: all the tree object data is densely encoded while requiring
no zlib inflate processing on access, and all SHA1 references are most
likely to be direct indices into the pack index file requiring no SHA1
search.  Path filtering can be accomplished on the path index directly
without any string comparison during the tree traversal.

Still lacking is some kind of delta encoding for multiple tree objects
with only small differences between them.  But that'll come later.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index d4a79f4..b91ee0b 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -393,6 +393,72 @@ bad:
 	return NULL;
 }
 
+/*
+ * This converts a canonical tree object buffer into its
+ * tightly packed representation using the already populated
+ * and sorted tree_path_table dictionary.  The parsing is
+ * strict so as to ensure the canonical version may always be
+ * regenerated and produce the same hash.
+ */
+void *pv4_encode_tree(void *_buffer, unsigned long *sizep)
+{
+	unsigned long size = *sizep;
+	unsigned char *in, *out, *end, *buffer = _buffer;
+	struct tree_desc desc;
+	struct name_entry name_entry;
+	int nb_entries;
+
+	if (!size)
+		return NULL;
+
+	/*
+	 * We can't make sure the result will always be smaller than the
+	 * input. The smallest possible entry is "0 x\0<40 byte SHA1>"
+	 * or 44 bytes.  The output entry may have a realistic path index
+	 * encoding using up to 3 bytes, and a non indexable SHA1 meaning
+	 * 41 bytes.  And the output data already has the nb_entries
+	 * headers.  In practice the output size will be significantly
+	 * smaller but for now let's make it simple.
+	 */
+	in = buffer;
+	out = xmalloc(size + 48);
+	end = out + size + 48;
+	buffer = out;
+
+	/* let's count how many entries there are */
+	init_tree_desc(&desc, in, size);
+	nb_entries = 0;
+	while (tree_entry(&desc, &name_entry))
+		nb_entries++;
+	out += encode_varint(nb_entries, out);
+
+	init_tree_desc(&desc, in, size);
+	while (tree_entry(&desc, &name_entry)) {
+		int pathlen, index;
+
+		if (end - out < 48) {
+			unsigned long sofar = out - buffer;
+			buffer = xrealloc(buffer, (sofar + 48)*2);
+			end = buffer + (sofar + 48)*2;
+			out = buffer + sofar;
+		}
+
+		pathlen = tree_entry_len(&name_entry);
+		index = dict_add_entry(tree_path_table, name_entry.mode,
+				       name_entry.path, pathlen);
+		if (index < 0) {
+			error("missing tree dict entry");
+			free(buffer);
+			return NULL;
+		}
+		out += encode_varint(index, out);
+		out += encode_sha1ref(name_entry.sha1, out);
+	}
+
+	*sizep = out - buffer;
+	return buffer;
+}
+
 static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	unsigned i, nr_objects = p->num_objects;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 12/38] pack v4: dictionary table output
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (10 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 11/38] pack v4: tree " Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 13/38] pack v4: creation code Nicolas Pitre
                   ` (27 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Here's the code to dump a table into a pack.  Table entries are written
according to the current sort order. This is important as objects use
this order to index into the table.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index b91ee0b..92d3662 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -544,6 +544,55 @@ static int create_pack_dictionaries(struct packed_git *p,
 	return 0;
 }
 
+static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
+{
+	unsigned char buffer[1024];
+	unsigned hdrlen;
+	unsigned long size, datalen;
+	z_stream stream;
+	int i, status;
+
+	/*
+	 * Stored dict table format: uncompressed data length followed by
+	 * compressed content.
+	 */
+
+	datalen = t->ptr;
+	hdrlen = encode_varint(datalen, buffer);
+	sha1write(f, buffer, hdrlen);
+
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, pack_compression_level);
+
+	for (i = 0; i < t->nb_entries; i++) {
+		stream.next_in = t->data + t->entry[i].offset;
+		stream.avail_in = 2 + strlen((char *)t->data + t->entry[i].offset + 2) + 1;
+		do {
+			stream.next_out = buffer;
+			stream.avail_out = sizeof(buffer);
+			status = deflate(&stream, 0);
+			size = stream.next_out - (unsigned char *)buffer;
+			sha1write(f, buffer, size);
+		} while (status == Z_OK);
+	}
+	do {
+		stream.next_out = buffer;
+		stream.avail_out = sizeof(buffer);
+		status = deflate(&stream, Z_FINISH);
+		size = stream.next_out - (unsigned char *)buffer;
+		sha1write(f, buffer, size);
+	} while (status == Z_OK);
+	if (status != Z_STREAM_END)
+		die("unable to deflate dictionary table (%d)", status);
+	if (stream.total_in != datalen)
+		die("dict data size mismatch (%ld vs %ld)",
+		    stream.total_in, datalen);
+	datalen = stream.total_out;
+	deflateEnd(&stream);
+
+	return hdrlen + datalen;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 13/38] pack v4: creation code
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (11 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 12/38] pack v4: dictionary table output Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 14/38] pack v4: object headers Nicolas Pitre
                   ` (26 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Let's actually open the destination pack file and write the header and
the tables.

The header isn't much different from pack v3, except for the pack version
number of course.

The first table is the sorted SHA1 table normally found in the pack index
file.  With pack v4 we write this table in the main pack file instead as
it is index referenced by subsequent objects in the pack.  Doing so has
many advantages:

- The SHA1 references used to be duplicated on disk: once in the pack
  index file, and then at least once or more within commit and tree
  objects referencing them.  The only SHA1 which is not being listed more
  than once this way is the one for a branch tip commit object and those
  are normally very few.  Now all that SHA1 data is represented only once.

- The SHA1 references found in commit and tree objects can be obtained
  on disk directly without having to deflate those objects first.

The SHA1 table size is obtained by multiplying the number of objects by 20.

And then the commit and path dictionary tables are written right after
the SHA1 table.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 92d3662..61b70c8 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -593,6 +593,48 @@ static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
 	return hdrlen + datalen;
 }
 
+static struct sha1file * packv4_open(char *path)
+{
+	int fd;
+
+	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
+	if (fd < 0)
+		die_errno("unable to create '%s'", path);
+	return sha1fd(fd, path);
+}
+
+static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
+{
+	struct pack_header hdr;
+
+	hdr.hdr_signature = htonl(PACK_SIGNATURE);
+	hdr.hdr_version = htonl(4);
+	hdr.hdr_entries = htonl(nr_objects);
+	sha1write(f, &hdr, sizeof(hdr));
+
+	return sizeof(hdr);
+}
+
+static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects,
+					 struct pack_idx_entry *objs)
+{
+	unsigned i;
+	unsigned long written = 0;
+
+	/* The sorted list of object SHA1's is always first */
+	for (i = 0; i < nr_objects; i++)
+		sha1write(f, objs[i].sha1, 20);
+	written = 20 * nr_objects;
+
+	/* Then the commit dictionary table */
+	written += write_dict_table(f, commit_name_table);
+
+	/* Followed by the path component dictionary table */
+	written += write_dict_table(f, tree_path_table);
+
+	return written;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
@@ -646,9 +688,10 @@ static struct packed_git *open_pack(const char *path)
 	return p;
 }
 
-static void process_one_pack(char *src_pack)
+static void process_one_pack(char *src_pack, char *dst_pack)
 {
 	struct packed_git *p;
+	struct sha1file *f;
 	struct pack_idx_entry *objs, **p_objs;
 	unsigned nr_objects;
 
@@ -661,15 +704,22 @@ static void process_one_pack(char *src_pack)
 	p_objs = sort_objs_by_offset(objs, nr_objects);
 
 	create_pack_dictionaries(p, p_objs);
+
+	f = packv4_open(dst_pack);
+	if (!f)
+		die("unable to open destination pack");
+	packv4_write_header(f, nr_objects);
+	packv4_write_tables(f, nr_objects, objs);
 }
 
 int main(int argc, char *argv[])
 {
-	if (argc != 2) {
-		fprintf(stderr, "Usage: %s <packfile>\n", argv[0]);
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <src_packfile> <dst_packfile>\n", argv[0]);
 		exit(1);
 	}
-	process_one_pack(argv[1]);
-	dict_dump();
+	process_one_pack(argv[1], argv[2]);
+	if (0)
+		dict_dump();
 	return 0;
 }
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 14/38] pack v4: object headers
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (12 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 13/38] pack v4: creation code Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 15/38] pack v4: object data copy Nicolas Pitre
                   ` (25 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

In pack v4 the object size and type is encoded differently from pack v3.
The object size uses the same efficient variable length number encoding
already used elsewhere.

The object type has 4 bits allocated to it compared to 3 bits in pack v3.
This should be quite sufficient for the foreseeable future, especially
since pack v4 has only one type of delta object instead of two.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 61b70c8..6098062 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -635,6 +635,33 @@ static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects
 	return written;
 }
 
+static int write_object_header(struct sha1file *f, enum object_type type, unsigned long size)
+{
+	unsigned char buf[16];
+	uint64_t val;
+	int len;
+
+	/*
+	 * We really have only one kind of delta object.
+	 */
+	if (type == OBJ_OFS_DELTA)
+		type = OBJ_REF_DELTA;
+
+	/*
+	 * We allocate 4 bits in the LSB for the object type which should
+	 * be good for quite a while, given that we effectively encode
+	 * only 5 object types: commit, tree, blob, delta, tag.
+	 */
+	val = size;
+	if (MSB(val, 4))
+		die("fixme: the code doesn't currently cope with big sizes");
+	val <<= 4;
+	val |= type;
+	len = encode_varint(val, buf);
+	sha1write(f, buf, len);
+	return len;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 15/38] pack v4: object data copy
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (13 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 14/38] pack v4: object headers Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 16/38] pack v4: object writing Nicolas Pitre
                   ` (24 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Blob and tag objects have no particular changes except for their object
header.

Delta objects are also copied as is, except for their delta base reference
which is converted to the new way as used elsewhere in pack v4 encoding
i.e. an index into the SHA1 table or a literal SHA1 prefixed by 0 if not
found in the table (see encode_sha1ref).  This is true for both REF_DELTA
as well as OFS_DELTA.

Object payload is validated against the recorded CRC32 in the source
pack index file when possible before being copied.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 6098062..b0e344f 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -12,6 +12,7 @@
 #include "object.h"
 #include "tree-walk.h"
 #include "pack.h"
+#include "pack-revindex.h"
 #include "varint.h"
 
 
@@ -662,6 +663,65 @@ static int write_object_header(struct sha1file *f, enum object_type type, unsign
 	return len;
 }
 
+static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
+				      off_t offset)
+{
+	struct pack_window *w_curs = NULL;
+	struct revindex_entry *revidx;
+	enum object_type type;
+	unsigned long avail, size, datalen, written;
+	int hdrlen, reflen, idx_nr;
+	unsigned char *src, buf[24];
+
+	revidx = find_pack_revindex(p, offset);
+	idx_nr = revidx->nr;
+	datalen = revidx[1].offset - offset;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	written = write_object_header(f, type, size);
+
+	if (type == OBJ_OFS_DELTA) {
+		const unsigned char *cp = src + hdrlen;
+		off_t base_offset = decode_varint(&cp);
+		hdrlen = cp - src;
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset)
+			die("delta offset out of bound");
+		revidx = find_pack_revindex(p, base_offset);
+		reflen = encode_sha1ref(nth_packed_object_sha1(p, revidx->nr), buf);
+		sha1write(f, buf, reflen);
+		written += reflen;
+	} else if (type == OBJ_REF_DELTA) {
+		reflen = encode_sha1ref(src + hdrlen, buf);
+		hdrlen += 20;
+		sha1write(f, buf, reflen);
+		written += reflen;
+	}
+
+	if (p->index_version > 1 &&
+	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
+		die("bad CRC for object at offset %"PRIuMAX" in %s",
+		    (uintmax_t)offset, p->pack_name);
+
+	offset += hdrlen;
+	datalen -= hdrlen;
+
+	while (datalen) {
+		src = use_pack(p, &w_curs, offset, &avail);
+		if (avail > datalen)
+			avail = datalen;
+		sha1write(f, src, avail);
+		written += avail;
+		offset += avail;
+		datalen -= avail;
+	}
+	unuse_pack(&w_curs);
+
+	return written;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 16/38] pack v4: object writing
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (14 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 15/38] pack v4: object data copy Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 17/38] pack v4: tree object delta encoding Nicolas Pitre
                   ` (23 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

This adds the missing code to finally be able to produce a complete
pack file version 4.  We trap commit and tree objects as those have
a completely new encoding.  Other object types are copied almost
unchanged.

As we go the pack index entries are updated in place to store the new
object offsets once they're written to the destination file.  This will
be needed later for writing the pack index file.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 3 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index b0e344f..5d76234 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -722,6 +722,59 @@ static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
 	return written;
 }
 
+static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
+				 struct pack_idx_entry *obj)
+{
+	void *src, *result;
+	struct object_info oi = {};
+	enum object_type type;
+	unsigned long size;
+	unsigned int hdrlen;
+
+	oi.typep = &type;
+	oi.sizep = &size;
+	if (packed_object_info(p, obj->offset, &oi) < 0)
+		die("cannot get type of %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	/* Some objects are copied without decompression */
+	switch (type) {
+	case OBJ_COMMIT:
+	case OBJ_TREE:
+		break;
+	default:
+		return copy_object_data(f, p, obj->offset);
+	}
+
+	/* The rest is converted into their new format */
+	src = unpack_entry(p, obj->offset, &type, &size);
+	if (!src)
+		die("cannot unpack %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+	if (check_sha1_signature(obj->sha1, src, size, typename(type)))
+		die("packed %s from %s is corrupt",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	hdrlen = write_object_header(f, type, size);
+	switch (type) {
+	case OBJ_COMMIT:
+		result = pv4_encode_commit(src, &size);
+		break;
+	case OBJ_TREE:
+		result = pv4_encode_tree(src, &size);
+		break;
+	default:
+		die("unexpected object type %d", type);
+	}
+	free(src);
+	if (!result)
+		die("can't convert %s object %s",
+		    typename(type), sha1_to_hex(obj->sha1));
+	sha1write(f, result, size);
+	free(result);
+	return hdrlen + size;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
@@ -780,7 +833,8 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	struct packed_git *p;
 	struct sha1file *f;
 	struct pack_idx_entry *objs, **p_objs;
-	unsigned nr_objects;
+	unsigned i, nr_objects;
+	off_t written = 0;
 
 	p = open_pack(src_pack);
 	if (!p)
@@ -791,12 +845,26 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	p_objs = sort_objs_by_offset(objs, nr_objects);
 
 	create_pack_dictionaries(p, p_objs);
+	sort_dict_entries_by_hits(commit_name_table);
+	sort_dict_entries_by_hits(tree_path_table);
 
 	f = packv4_open(dst_pack);
 	if (!f)
 		die("unable to open destination pack");
-	packv4_write_header(f, nr_objects);
-	packv4_write_tables(f, nr_objects, objs);
+	written += packv4_write_header(f, nr_objects);
+	written += packv4_write_tables(f, nr_objects, objs);
+
+	/* Let's write objects out, updating the object index list in place */
+	all_objs = objs;
+	all_objs_nr = nr_objects;
+	for (i = 0; i < nr_objects; i++) {
+		off_t obj_pos = written;
+		struct pack_idx_entry *obj = p_objs[i];
+		written += packv4_write_object(f, p, obj);
+		obj->offset = obj_pos;
+	}
+
+	sha1close(f, NULL, CSUM_CLOSE | CSUM_FSYNC);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 17/38] pack v4: tree object delta encoding
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (15 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 16/38] pack v4: object writing Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 18/38] pack v4: load delta candidate for encoding tree objects Nicolas Pitre
                   ` (22 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

In order to be able to quickly walk tree objects, let's encode their
"delta" as a range of entries into another tree object.

In order to distinguish a copy sequence from a regular entry,
the entry index LSB is reserved to indicate a copy sequence.  Therefore
the actual index of a path component is shifted left one bit.

The encoding allows for the base object to change so multiple base
objects can be borrowed from.  The code doesn't try to exploit this
possibility at the moment though.

The code isn't optimal at the moment as it doesn't consider the case
where a copy sequence could be larger than the local sequence it
means to replace.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 103 insertions(+), 5 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 5d76234..6830a0a 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -394,24 +394,53 @@ bad:
 	return NULL;
 }
 
+static int compare_tree_entries(struct name_entry *e1, struct name_entry *e2)
+{
+	int len1 = tree_entry_len(e1);
+	int len2 = tree_entry_len(e2);
+	int len = len1 < len2 ? len1 : len2;
+	unsigned char c1, c2;
+	int cmp;
+
+	cmp = memcmp(e1->path, e2->path, len);
+	if (cmp)
+		return cmp;
+	c1 = e1->path[len];
+	c2 = e2->path[len];
+	if (!c1 && S_ISDIR(e1->mode))
+		c1 = '/';
+	if (!c2 && S_ISDIR(e2->mode))
+		c2 = '/';
+	return c1 - c2;
+}
+
 /*
  * This converts a canonical tree object buffer into its
  * tightly packed representation using the already populated
  * and sorted tree_path_table dictionary.  The parsing is
  * strict so to ensure the canonical version may always be
  * regenerated and produce the same hash.
+ *
+ * If a delta buffer is provided, we may encode multiple ranges of tree
+ * entries against that buffer.
  */
-void *pv4_encode_tree(void *_buffer, unsigned long *sizep)
+void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
+		      void *delta, unsigned long delta_size,
+		      const unsigned char *delta_sha1)
 {
 	unsigned long size = *sizep;
 	unsigned char *in, *out, *end, *buffer = _buffer;
-	struct tree_desc desc;
-	struct name_entry name_entry;
+	struct tree_desc desc, delta_desc;
+	struct name_entry name_entry, delta_entry;
 	int nb_entries;
+	unsigned int copy_start, copy_count = 0, delta_pos = 0, first_delta = 1;
 
 	if (!size)
 		return NULL;
 
+	if (!delta_size)
+		delta = NULL;
+
 	/*
 	 * We can't make sure the result will always be smaller than the
 	 * input. The smallest possible entry is "0 x\0<40 byte SHA1>"
@@ -434,9 +463,42 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep)
 	out += encode_varint(nb_entries, out);
 
 	init_tree_desc(&desc, in, size);
+	if (delta) {
+		init_tree_desc(&delta_desc, delta, delta_size);
+		if (!tree_entry(&delta_desc, &delta_entry))
+			delta = NULL;
+	}
+
 	while (tree_entry(&desc, &name_entry)) {
 		int pathlen, index;
 
+		/*
+		 * Try to match entries against our delta object.
+		 */
+		if (delta) {
+			int ret;
+
+			do {
+				ret = compare_tree_entries(&name_entry, &delta_entry);
+				if (ret <= 0 || copy_count != 0)
+					break;
+				delta_pos++;
+				if (!tree_entry(&delta_desc, &delta_entry))
+					delta = NULL;
+			} while (delta);
+
+			if (ret == 0 && name_entry.mode == delta_entry.mode &&
+			    hashcmp(name_entry.sha1, delta_entry.sha1) == 0) {
+				if (!copy_count)
+					copy_start = delta_pos;
+				copy_count++;
+				delta_pos++;
+				if (!tree_entry(&delta_desc, &delta_entry))
+					delta = NULL;
+				continue;
+			}
+		}
+
 		if (end - out < 48) {
 			unsigned long sofar = out - buffer;
 			buffer = xrealloc(buffer, (sofar + 48)*2);
@@ -444,6 +506,32 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep)
 			out = buffer + sofar;
 		}
 
+		if (copy_count) {
+			/*
+			 * Let's write a sequence indicating we're copying
+			 * entries from another object:
+			 *
+			 * entry_start + entry_count + object_ref
+			 *
+			 * To distinguish between 'entry_start' and an actual
+			 * entry index, we use the LSB = 1.
+			 *
+			 * Furthermore, if object_ref is the same as the
+			 * preceding one, we can omit it and save some
+			 * more space, especially if that ends up being a
+			 * full sha1 reference.  Let's steal the LSB
+			 * of entry_count for that purpose.
+			 */
+			copy_start = (copy_start << 1) | 1;
+			copy_count = (copy_count << 1) | first_delta;
+			out += encode_varint(copy_start, out);
+			out += encode_varint(copy_count, out);
+			if (first_delta)
+				out += encode_sha1ref(delta_sha1, out);
+			copy_count = 0;
+			first_delta = 0;
+		}
+
 		pathlen = tree_entry_len(&name_entry);
 		index = dict_add_entry(tree_path_table, name_entry.mode,
 				       name_entry.path, pathlen);
@@ -452,10 +540,20 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep)
 			free(buffer);
 			return NULL;
 		}
-		out += encode_varint(index, out);
+		out += encode_varint(index << 1, out);
 		out += encode_sha1ref(name_entry.sha1, out);
 	}
 
+	if (copy_count) {
+		/* flush the trailing copy */
+		copy_start = (copy_start << 1) | 1;
+		copy_count = (copy_count << 1) | first_delta;
+		out += encode_varint(copy_start, out);
+		out += encode_varint(copy_count, out);
+		if (first_delta)
+			out += encode_sha1ref(delta_sha1, out);
+	}
+
 	*sizep = out - buffer;
 	return buffer;
 }
@@ -761,7 +859,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		result = pv4_encode_commit(src, &size);
 		break;
 	case OBJ_TREE:
-		result = pv4_encode_tree(src, &size);
+		result = pv4_encode_tree(src, &size, NULL, 0, NULL);
 		break;
 	default:
 		die("unexpected object type %d", type);
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 18/38] pack v4: load delta candidate for encoding tree objects
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (16 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 17/38] pack v4: tree object delta encoding Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 19/38] packv4-create: optimize delta encoding Nicolas Pitre
                   ` (21 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

The SHA1 of the base object is retrieved and the corresponding object
is loaded in memory for pv4_encode_tree() to look at.  Simple but
effective.  Obviously this relies on the delta matching already performed
during the pack v3 delta search.  Some native delta search for pack v4
could be investigated eventually.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 6830a0a..15c5959 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -820,18 +820,56 @@ static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
 	return written;
 }
 
+static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
+				     unsigned char *sha1_buf)
+{
+	struct pack_window *w_curs = NULL;
+	enum object_type type;
+	unsigned long avail, size;
+	int hdrlen;
+	unsigned char *src;
+	const unsigned char *base_sha1 = NULL; ;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	if (type == OBJ_OFS_DELTA) {
+		const unsigned char *cp = src + hdrlen;
+		off_t base_offset = decode_varint(&cp);
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset) {
+			error("delta offset out of bound");
+		} else {
+			struct revindex_entry *revidx;
+			revidx = find_pack_revindex(p, base_offset);
+			base_sha1 = nth_packed_object_sha1(p, revidx->nr);
+		}
+	} else if (type == OBJ_REF_DELTA) {
+		base_sha1 = src + hdrlen;
+	} else
+		error("expected to get a delta but got a %s", typename(type));
+
+	unuse_pack(&w_curs);
+
+	if (!base_sha1)
+		return NULL;
+	hashcpy(sha1_buf, base_sha1);
+	return sha1_buf;
+}
+
 static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 				 struct pack_idx_entry *obj)
 {
 	void *src, *result;
 	struct object_info oi = {};
-	enum object_type type;
+	enum object_type type, packed_type;
 	unsigned long size;
 	unsigned int hdrlen;
 
 	oi.typep = &type;
 	oi.sizep = &size;
-	if (packed_object_info(p, obj->offset, &oi) < 0)
+	packed_type = packed_object_info(p, obj->offset, &oi);
+	if (packed_type < 0)
 		die("cannot get type of %s from %s",
 		    sha1_to_hex(obj->sha1), p->pack_name);
 
@@ -859,7 +897,26 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		result = pv4_encode_commit(src, &size);
 		break;
 	case OBJ_TREE:
-		result = pv4_encode_tree(src, &size, NULL, 0, NULL);
+		if (packed_type != OBJ_TREE) {
+			unsigned char sha1_buf[20], *ref_sha1;
+			void *ref;
+			enum object_type ref_type;
+			unsigned long ref_size;
+
+			ref_sha1 = get_delta_base(p, obj->offset, sha1_buf);
+			if (!ref_sha1)
+				die("unable to get delta base sha1 for %s",
+						sha1_to_hex(obj->sha1));
+			ref = read_sha1_file(ref_sha1, &ref_type, &ref_size);
+			if (!ref || ref_type != OBJ_TREE)
+				die("cannot obtain delta base for %s",
+						sha1_to_hex(obj->sha1));
+			result = pv4_encode_tree(src, &size,
+						 ref, ref_size, ref_sha1);
+			free(ref);
+		} else {
+			result = pv4_encode_tree(src, &size, NULL, 0, NULL);
+		}
 		break;
 	default:
 		die("unexpected object type %d", type);
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 19/38] packv4-create: optimize delta encoding
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (17 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 18/38] pack v4: load delta candidate for encoding tree objects Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 20/38] pack v4: honor pack.compression config option Nicolas Pitre
                   ` (20 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Make sure the copy sequence is smaller than the list of tree entries it
is meant to replace.  We do so by encoding tree entries in parallel with
the delta entry comparison, and then comparing the length of both
sequences.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 65 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 15c5959..c8d3053 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -433,7 +433,8 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 	struct tree_desc desc, delta_desc;
 	struct name_entry name_entry, delta_entry;
 	int nb_entries;
-	unsigned int copy_start, copy_count = 0, delta_pos = 0, first_delta = 1;
+	unsigned int copy_start = 0, copy_count = 0, copy_pos = 0, copy_end = 0;
+	unsigned int delta_pos = 0, first_delta = 1;
 
 	if (!size)
 		return NULL;
@@ -489,24 +490,23 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 
 			if (ret == 0 && name_entry.mode == delta_entry.mode &&
 			    hashcmp(name_entry.sha1, delta_entry.sha1) == 0) {
-				if (!copy_count)
+				if (!copy_count) {
 					copy_start = delta_pos;
+					copy_pos = out - buffer;
+					copy_end = 0;
+				}
 				copy_count++;
 				delta_pos++;
 				if (!tree_entry(&delta_desc, &delta_entry))
 					delta = NULL;
-				continue;
-			}
-		}
+			} else
+				copy_end = 1;
+		} else
+			copy_end = 1;
 
-		if (end - out < 48) {
-			unsigned long sofar = out - buffer;
-			buffer = xrealloc(buffer, (sofar + 48)*2);
-			end = buffer + (sofar + 48)*2;
-			out = buffer + sofar;
-		}
+		if (copy_count && copy_end) {
+			unsigned char copy_buf[48], *cp = copy_buf;
 
-		if (copy_count) {
 			/*
 			 * Let's write a sequence indicating we're copying
 			 * entries from another object:
@@ -524,12 +524,31 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			 */
 			copy_start = (copy_start << 1) | 1;
 			copy_count = (copy_count << 1) | first_delta;
-			out += encode_varint(copy_start, out);
-			out += encode_varint(copy_count, out);
+			cp += encode_varint(copy_start, cp);
+			cp += encode_varint(copy_count, cp);
 			if (first_delta)
-				out += encode_sha1ref(delta_sha1, out);
+				cp += encode_sha1ref(delta_sha1, cp);
 			copy_count = 0;
-			first_delta = 0;
+
+			/*
+			 * Now let's make sure this is going to take less
+			 * space than the corresponding direct entries we've
+			 * created in parallel.  If so we dump the copy
+			 * sequence over those entries in the output buffer.
+			 */
+			if (cp - copy_buf < out - &buffer[copy_pos]) {
+				out = buffer + copy_pos;
+				memcpy(out, copy_buf, cp - copy_buf);
+				out += cp - copy_buf;
+				first_delta = 0;
+			}
+		}
+
+		if (end - out < 48) {
+			unsigned long sofar = out - buffer;
+			buffer = xrealloc(buffer, (sofar + 48)*2);
+			end = buffer + (sofar + 48)*2;
+			out = buffer + sofar;
 		}
 
 		pathlen = tree_entry_len(&name_entry);
@@ -545,13 +564,19 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 	}
 
 	if (copy_count) {
-		/* flush the trailing copy */
+		/* process the trailing copy */
+		unsigned char copy_buf[48], *cp = copy_buf;
 		copy_start = (copy_start << 1) | 1;
 		copy_count = (copy_count << 1) | first_delta;
-		out += encode_varint(copy_start, out);
-		out += encode_varint(copy_count, out);
+		cp += encode_varint(copy_start, cp);
+		cp += encode_varint(copy_count, cp);
 		if (first_delta)
-			out += encode_sha1ref(delta_sha1, out);
+			cp += encode_sha1ref(delta_sha1, cp);
+		if (cp - copy_buf < out - &buffer[copy_pos]) {
+			out = buffer + copy_pos;
+			memcpy(out, copy_buf, cp - copy_buf);
+			out += cp - copy_buf;
+		}
 	}
 
 	*sizep = out - buffer;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 20/38] pack v4: honor pack.compression config option
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (18 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 19/38] packv4-create: optimize delta encoding Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 21/38] pack v4: relax commit parsing a bit Nicolas Pitre
                   ` (19 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index c8d3053..45f8427 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -16,6 +16,7 @@
 #include "varint.h"
 
 
+static int pack_compression_seen;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 
 struct data_entry {
@@ -1047,12 +1048,30 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	sha1close(f, NULL, CSUM_CLOSE | CSUM_FSYNC);
 }
 
+static int git_pack_config(const char *k, const char *v, void *cb)
+{
+	if (!strcmp(k, "pack.compression")) {
+		int level = git_config_int(k, v);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad pack compression level %d", level);
+		pack_compression_level = level;
+		pack_compression_seen = 1;
+		return 0;
+	}
+	return git_default_config(k, v, cb);
+}
+
 int main(int argc, char *argv[])
 {
 	if (argc != 3) {
 		fprintf(stderr, "Usage: %s <src_packfile> <dst_packfile>\n", argv[0]);
 		exit(1);
 	}
+	git_config(git_pack_config, NULL);
+	if (!pack_compression_seen && core_compression_seen)
+		pack_compression_level = core_compression_level;
 	process_one_pack(argv[1], argv[2]);
 	if (0)
 		dict_dump();
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 21/38] pack v4: relax commit parsing a bit
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (19 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 20/38] pack v4: honor pack.compression config option Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 22/38] pack index v3 Nicolas Pitre
                   ` (18 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

At least commit af25e94d4dcfb9608846242fabdd4e6014e5c9f0 in the Linux
kernel repository has "author  <> 1120285620 -0700", i.e. an empty name
and email, so the minimum length checks are relaxed to accept it.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 45f8427..a9e9002 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -158,12 +158,12 @@ static char *get_nameend_and_tz(char *from, int *tz_val)
 	char *end, *tz;
 
 	tz = strchr(from, '\n');
-	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
-	if (!tz || tz - from < 13)
+	/* let's assume the smallest possible string to be " <> 0 +0000\n" */
+	if (!tz || tz - from < 11)
 		return NULL;
 	tz -= 4;
 	end = tz - 4;
-	while (end - from > 5 && *end != ' ')
+	while (end - from > 3 && *end != ' ')
 		end--;
 	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
 		return NULL;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 22/38] pack index v3
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (20 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 21/38] pack v4: relax commit parsing a bit Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 23/38] packv4-create: normalize pack name to properly generate the pack index file name Nicolas Pitre
                   ` (17 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

This is a minor change over pack index v2.  Since pack v4 already contains
the sorted SHA1 table, it is therefore omitted from the index file.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 pack-write.c    |  6 +++++-
 packv4-create.c | 10 +++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/pack-write.c b/pack-write.c
index ca9e63b..631007e 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -87,6 +87,8 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 
 	/* if last object's offset is >= 2^31 we should use index V2 */
 	index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
+	if (index_version < opts->version)
+		index_version = opts->version;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -127,7 +129,9 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 			uint32_t offset = htonl(obj->offset);
 			sha1write(f, &offset, 4);
 		}
-		sha1write(f, obj->sha1, 20);
+		/* Pack v4 (using index v3) carries the SHA1 table already */
+		if (index_version < 3)
+			sha1write(f, obj->sha1, 20);
 		git_SHA1_Update(&ctx, obj->sha1, 20);
 		if ((opts->flags & WRITE_IDX_STRICT) &&
 		    (i && !hashcmp(list[-2]->sha1, obj->sha1)))
diff --git a/packv4-create.c b/packv4-create.c
index a9e9002..22cdf8e 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -1014,8 +1014,10 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	struct packed_git *p;
 	struct sha1file *f;
 	struct pack_idx_entry *objs, **p_objs;
+	struct pack_idx_option idx_opts;
 	unsigned i, nr_objects;
 	off_t written = 0;
+	unsigned char pack_sha1[20];
 
 	p = open_pack(src_pack);
 	if (!p)
@@ -1041,11 +1043,17 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	for (i = 0; i < nr_objects; i++) {
 		off_t obj_pos = written;
 		struct pack_idx_entry *obj = p_objs[i];
+		crc32_begin(f);
 		written += packv4_write_object(f, p, obj);
 		obj->offset = obj_pos;
+		obj->crc32 = crc32_end(f);
 	}
 
-	sha1close(f, NULL, CSUM_CLOSE | CSUM_FSYNC);
+	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
+
+	reset_pack_idx_option(&idx_opts);
+	idx_opts.version = 3;
+	write_idx_file(dst_pack, p_objs, nr_objects, &idx_opts, pack_sha1);
 }
 
 static int git_pack_config(const char *k, const char *v, void *cb)
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 23/38] packv4-create: normalize pack name to properly generate the pack index file name
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (21 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 22/38] pack index v3 Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 24/38] packv4-create: add progress display Nicolas Pitre
                   ` (16 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 73 +++++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 39 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 22cdf8e..c23c791 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -956,56 +956,46 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 	return hdrlen + size;
 }
 
-static struct packed_git *open_pack(const char *path)
+static char *normalize_pack_name(const char *path)
 {
-	char arg[PATH_MAX];
+	char buf[PATH_MAX];
 	int len;
-	struct packed_git *p;
 
-	len = strlcpy(arg, path, PATH_MAX);
-	if (len >= PATH_MAX) {
-		error("name too long: %s", path);
-		return NULL;
-	}
+	len = strlcpy(buf, path, PATH_MAX);
+	if (len >= PATH_MAX - 6)
+		die("name too long: %s", path);
 
 	/*
 	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
-	 * normalize these forms to "foo.idx" for add_packed_git().
+	 * normalize these forms to "foo.pack".
 	 */
-	if (has_extension(arg, ".pack")) {
-		strcpy(arg + len - 5, ".idx");
-		len--;
-	} else if (!has_extension(arg, ".idx")) {
-		if (len + 4 >= PATH_MAX) {
-			error("name too long: %s.idx", arg);
-			return NULL;
-		}
-		strcpy(arg + len, ".idx");
-		len += 4;
+	if (has_extension(buf, ".idx")) {
+		strcpy(buf + len - 4, ".pack");
+		len++;
+	} else if (!has_extension(buf, ".pack")) {
+		strcpy(buf + len, ".pack");
+		len += 5;
 	}
 
-	/*
-	 * add_packed_git() uses our buffer (containing "foo.idx") to
-	 * build the pack filename ("foo.pack").  Make sure it fits.
-	 */
-	if (len + 1 >= PATH_MAX) {
-		arg[len - 4] = '\0';
-		error("name too long: %s.pack", arg);
-		return NULL;
-	}
+	return xstrdup(buf);
+}
 
-	p = add_packed_git(arg, len, 1);
-	if (!p) {
-		error("packfile %s not found.", arg);
-		return NULL;
-	}
+static struct packed_git *open_pack(const char *path)
+{
+	char *packname = normalize_pack_name(path);
+	int len = strlen(packname);
+	struct packed_git *p;
+
+	strcpy(packname + len - 5, ".idx");
+	p = add_packed_git(packname, len - 1, 1);
+	if (!p)
+		die("packfile %s not found.", packname);
 
 	install_packed_git(p);
-	if (open_pack_index(p)) {
-		error("packfile %s index not opened", p->pack_name);
-		return NULL;
-	}
+	if (open_pack_index(p))
+		die("packfile %s index not opened", p->pack_name);
 
+	free(packname);
 	return p;
 }
 
@@ -1017,6 +1007,7 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	struct pack_idx_option idx_opts;
 	unsigned i, nr_objects;
 	off_t written = 0;
+	char *packname;
 	unsigned char pack_sha1[20];
 
 	p = open_pack(src_pack);
@@ -1031,7 +1022,8 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	sort_dict_entries_by_hits(commit_name_table);
 	sort_dict_entries_by_hits(tree_path_table);
 
-	f = packv4_open(dst_pack);
+	packname = normalize_pack_name(dst_pack);
+	f = packv4_open(packname);
 	if (!f)
 		die("unable to open destination pack");
 	written += packv4_write_header(f, nr_objects);
@@ -1053,7 +1045,10 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 
 	reset_pack_idx_option(&idx_opts);
 	idx_opts.version = 3;
-	write_idx_file(dst_pack, p_objs, nr_objects, &idx_opts, pack_sha1);
+	strcpy(packname + strlen(packname) - 5, ".idx");
+	write_idx_file(packname, p_objs, nr_objects, &idx_opts, pack_sha1);
+
+	free(packname);
 }
 
 static int git_pack_config(const char *k, const char *v, void *cb)
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 24/38] packv4-create: add progress display
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (22 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 23/38] packv4-create: normalize pack name to properly generate the pack index file name Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 25/38] pack v4: initial pack index v3 support on the read side Nicolas Pitre
                   ` (15 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index c23c791..fd16222 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -13,6 +13,7 @@
 #include "tree-walk.h"
 #include "pack.h"
 #include "pack-revindex.h"
+#include "progress.h"
 #include "varint.h"
 
 
@@ -627,8 +628,10 @@ static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
 static int create_pack_dictionaries(struct packed_git *p,
 				    struct pack_idx_entry **obj_list)
 {
+	struct progress *progress_state;
 	unsigned int i;
 
+	progress_state = start_progress("Scanning objects", p->num_objects);
 	for (i = 0; i < p->num_objects; i++) {
 		struct pack_idx_entry *obj = obj_list[i];
 		void *data;
@@ -637,6 +640,8 @@ static int create_pack_dictionaries(struct packed_git *p,
 		struct object_info oi = {};
 		int (*add_dict_entries)(void *, unsigned long);
 
+		display_progress(progress_state, i+1);
+
 		oi.typep = &type;
 		oi.sizep = &size;
 		if (packed_object_info(p, obj->offset, &oi) < 0)
@@ -666,6 +671,7 @@ static int create_pack_dictionaries(struct packed_git *p,
 		free(data);
 	}
 
+	stop_progress(&progress_state);
 	return 0;
 }
 
@@ -1009,6 +1015,7 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	off_t written = 0;
 	char *packname;
 	unsigned char pack_sha1[20];
+	struct progress *progress_state;
 
 	p = open_pack(src_pack);
 	if (!p)
@@ -1030,6 +1037,7 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	written += packv4_write_tables(f, nr_objects, objs);
 
 	/* Let's write objects out, updating the object index list in place */
+	progress_state = start_progress("Writing objects", nr_objects);
 	all_objs = objs;
 	all_objs_nr = nr_objects;
 	for (i = 0; i < nr_objects; i++) {
@@ -1039,7 +1047,9 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 		written += packv4_write_object(f, p, obj);
 		obj->offset = obj_pos;
 		obj->crc32 = crc32_end(f);
+		display_progress(progress_state, i+1);
 	}
+	stop_progress(&progress_state);
 
 	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
 
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 25/38] pack v4: initial pack index v3 support on the read side
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (23 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 24/38] packv4-create: add progress display Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 26/38] pack v4: object header decode Nicolas Pitre
                   ` (14 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

A bit crude but good enough for now.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h         |  1 +
 pack-check.c    |  4 +++-
 pack-revindex.c |  7 ++++---
 sha1_file.c     | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 4 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/cache.h b/cache.h
index 4231dfa..c939b60 100644
--- a/cache.h
+++ b/cache.h
@@ -1021,6 +1021,7 @@ extern struct packed_git {
 	off_t pack_size;
 	const void *index_data;
 	size_t index_size;
+	const unsigned char *sha1_table;
 	uint32_t num_objects;
 	uint32_t num_bad_objects;
 	unsigned char *bad_object_sha1;
diff --git a/pack-check.c b/pack-check.c
index 63a595c..8200f24 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -25,6 +25,7 @@ int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
 {
 	const uint32_t *index_crc;
 	uint32_t data_crc = crc32(0, NULL, 0);
+	unsigned sha1_table;
 
 	do {
 		unsigned long avail;
@@ -36,8 +37,9 @@ int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
 		len -= avail;
 	} while (len);
 
+	sha1_table = p->index_version < 3 ? (p->num_objects * (20/4)) : 0;
 	index_crc = p->index_data;
-	index_crc += 2 + 256 + p->num_objects * (20/4) + nr;
+	index_crc += 2 + 256 + sha1_table + nr;
 
 	return data_crc != ntohl(*index_crc);
 }
diff --git a/pack-revindex.c b/pack-revindex.c
index b4d2b35..739a568 100644
--- a/pack-revindex.c
+++ b/pack-revindex.c
@@ -170,9 +170,10 @@ static void create_pack_revindex(struct pack_revindex *rix)
 	index += 4 * 256;
 
 	if (p->index_version > 1) {
-		const uint32_t *off_32 =
-			(uint32_t *)(index + 8 + p->num_objects * (20 + 4));
-		const uint32_t *off_64 = off_32 + p->num_objects;
+		const uint32_t *off_32, *off_64;
+		unsigned sha1 = p->index_version < 3 ? 20 : 0;
+		off_32 = (uint32_t *)(index + 8 + p->num_objects * (sha1 + 4));
+		off_64 = off_32 + p->num_objects;
 		for (i = 0; i < num_ent; i++) {
 			uint32_t off = ntohl(*off_32++);
 			if (!(off & 0x80000000)) {
diff --git a/sha1_file.c b/sha1_file.c
index c2020d0..5c63781 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -504,7 +504,7 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 	hdr = idx_map;
 	if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) {
 		version = ntohl(hdr->idx_version);
-		if (version < 2 || version > 2) {
+		if (version < 2 || version > 3) {
 			munmap(idx_map, idx_size);
 			return error("index file %s is version %"PRIu32
 				     " and is not supported by this binary"
@@ -539,12 +539,13 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 			munmap(idx_map, idx_size);
 			return error("wrong index v1 file size in %s", path);
 		}
-	} else if (version == 2) {
+	} else if (version == 2 || version == 3) {
+		unsigned long min_size, max_size;
 		/*
 		 * Minimum size:
 		 *  - 8 bytes of header
 		 *  - 256 index entries 4 bytes each
-		 *  - 20-byte sha1 entry * nr
+		 *  - 20-byte sha1 entry * nr (version 2 only)
 		 *  - 4-byte crc entry * nr
 		 *  - 4-byte offset entry * nr
 		 *  - 20-byte SHA1 of the packfile
@@ -553,8 +554,10 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 		 * variable sized table containing 8-byte entries
 		 * for offsets larger than 2^31.
 		 */
-		unsigned long min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
-		unsigned long max_size = min_size;
+		min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
+		if (version != 2)
+			min_size -= nr*20;
+		max_size = min_size;
 		if (nr)
 			max_size += (nr - 1)*8;
 		if (idx_size < min_size || idx_size > max_size) {
@@ -573,6 +576,36 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 		}
 	}
 
+	if (version >= 3) {
+		/* the SHA1 table is located in the main pack file */
+		void *pack_map;
+		struct pack_header *pack_hdr;
+
+		fd = git_open_noatime(p->pack_name);
+		if (fd < 0) {
+			munmap(idx_map, idx_size);
+			return error("unable to open %s", p->pack_name);
+		}
+		if (fstat(fd, &st) != 0 || xsize_t(st.st_size) < 12 + nr*20) {
+			close(fd);
+			munmap(idx_map, idx_size);
+			return error("size of %s is wrong", p->pack_name);
+		}
+		pack_map = xmmap(NULL, 12 + nr*20, PROT_READ, MAP_PRIVATE, fd, 0);
+		close(fd);
+		pack_hdr = pack_map;
+		if (pack_hdr->hdr_signature != htonl(PACK_SIGNATURE) ||
+		    pack_hdr->hdr_version != htonl(4) ||
+		    pack_hdr->hdr_entries != htonl(nr)) {
+			munmap(idx_map, idx_size);
+			munmap(pack_map, 12 + nr*20);
+			return error("packfile for %s doesn't match expectations", path);
+		}
+		p->sha1_table = pack_map;
+		p->sha1_table += 12;
+	} else
+		p->sha1_table = NULL;
+
 	p->index_version = version;
 	p->index_data = idx_map;
 	p->index_size = idx_size;
@@ -697,6 +730,10 @@ void close_pack_index(struct packed_git *p)
 		munmap((void *)p->index_data, p->index_size);
 		p->index_data = NULL;
 	}
+	if (p->sha1_table) {
+		munmap((void *)(p->sha1_table - 12), 12 + p->num_objects * 20);
+		p->sha1_table = NULL;
+	}
 }
 
 /*
@@ -2226,9 +2263,12 @@ const unsigned char *nth_packed_object_sha1(struct packed_git *p,
 	index += 4 * 256;
 	if (p->index_version == 1) {
 		return index + 24 * n + 4;
-	} else {
+	} else if (p->index_version == 2) {
 		index += 8;
 		return index + 20 * n;
+	} else {
+		index = p->sha1_table;
+		return index + 20 * n;
 	}
 }
 
@@ -2241,6 +2281,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
 	} else {
 		uint32_t off;
 		index += 8 + p->num_objects * (20 + 4);
+		if (p->index_version != 2)
+			index -= p->num_objects * 20;
 		off = ntohl(*((uint32_t *)(index + 4 * n)));
 		if (!(off & 0x80000000))
 			return off;
@@ -2281,6 +2323,8 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 		stride = 24;
 		index += 4;
 	}
+	if (p->index_version > 2)
+		index = p->sha1_table;
 
 	if (debug_lookup)
 		printf("%02x%02x%02x... lo %u hi %u nr %"PRIu32"\n",
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 26/38] pack v4: object header decode
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (24 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 25/38] pack v4: initial pack index v3 support on the read side Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 27/38] pack v4: code to obtain a SHA1 from a sha1ref Nicolas Pitre
                   ` (13 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

For this we need the pack version.  However only open_packed_git_1() has
been audited for pack v4 so far, hence the version validation is not
added to pack_version_ok() just yet.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h     |  1 +
 sha1_file.c | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index c939b60..59d9ba7 100644
--- a/cache.h
+++ b/cache.h
@@ -1025,6 +1025,7 @@ extern struct packed_git {
 	uint32_t num_objects;
 	uint32_t num_bad_objects;
 	unsigned char *bad_object_sha1;
+	int version;
 	int index_version;
 	time_t mtime;
 	int pack_fd;
diff --git a/sha1_file.c b/sha1_file.c
index 5c63781..a298933 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -10,6 +10,7 @@
 #include "string-list.h"
 #include "delta.h"
 #include "pack.h"
+#include "varint.h"
 #include "blob.h"
 #include "commit.h"
 #include "run-command.h"
@@ -845,10 +846,11 @@ static int open_packed_git_1(struct packed_git *p)
 		return error("file %s is far too short to be a packfile", p->pack_name);
 	if (hdr.hdr_signature != htonl(PACK_SIGNATURE))
 		return error("file %s is not a GIT packfile", p->pack_name);
-	if (!pack_version_ok(hdr.hdr_version))
+	if (!pack_version_ok(hdr.hdr_version) && hdr.hdr_version != htonl(4))
 		return error("packfile %s is version %"PRIu32" and not"
 			" supported (try upgrading GIT to a newer version)",
 			p->pack_name, ntohl(hdr.hdr_version));
+	p->version = ntohl(hdr.hdr_version);
 
 	/* Verify the pack matches its index. */
 	if (p->num_objects != ntohl(hdr.hdr_entries))
@@ -1725,7 +1727,15 @@ int unpack_object_header(struct packed_git *p,
 	 * insane, so we know won't exceed what we have been given.
 	 */
 	base = use_pack(p, w_curs, *curpos, &left);
-	used = unpack_object_header_buffer(base, left, &type, sizep);
+	if (p->version < 4) {
+		used = unpack_object_header_buffer(base, left, &type, sizep);
+	} else {
+		const unsigned char *cp = base;
+		uintmax_t val = decode_varint(&cp);
+		used = cp - base;
+		type = val & 0xf;
+		*sizep = val >> 4;
+	}
 	if (!used) {
 		type = OBJ_BAD;
 	} else
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 27/38] pack v4: code to obtain a SHA1 from a sha1ref
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (25 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 26/38] pack v4: object header decode Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 28/38] pack v4: code to load and prepare a pack dictionary table for use Nicolas Pitre
                   ` (12 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Let's start actually parsing pack v4 data.  Here's the first item.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 Makefile       |  1 +
 packv4-parse.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 packv4-parse.c

diff --git a/Makefile b/Makefile
index 4716113..ba6cafc 100644
--- a/Makefile
+++ b/Makefile
@@ -838,6 +838,7 @@ LIB_OBJS += object.o
 LIB_OBJS += pack-check.o
 LIB_OBJS += pack-revindex.o
 LIB_OBJS += pack-write.o
+LIB_OBJS += packv4-parse.o
 LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += parse-options-cb.o
diff --git a/packv4-parse.c b/packv4-parse.c
new file mode 100644
index 0000000..299fc48
--- /dev/null
+++ b/packv4-parse.c
@@ -0,0 +1,30 @@
+/*
+ * Code to parse pack v4 object encoding
+ *
+ * (C) Nicolas Pitre <nico@fluxnic.net>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+#include "varint.h"
+
+/*
+ * Decode one "sha1ref" at *bufp and return a pointer to the 20-byte SHA1
+ * it designates, advancing *bufp past the encoded reference.
+ *
+ * Two encodings are handled:
+ *  - a leading 0 byte followed by the 20 raw SHA1 bytes stored inline
+ *    (the returned pointer then points into the caller's buffer);
+ *  - otherwise a varint holding a 1-based index into p->sha1_table.
+ *
+ * Dies if the index does not designate one of the p->num_objects entries.
+ */
+const unsigned char *get_sha1ref(struct packed_git *p,
+				 const unsigned char **bufp)
+{
+	const unsigned char *sha1;
+
+	if (!**bufp) {
+		/* literal SHA1: skip the 0 marker, then 20 bytes of hash */
+		sha1 = *bufp + 1;
+		*bufp += 21;
+	} else {
+		unsigned int index = decode_varint(bufp);
+		/*
+		 * Valid indices are 1..num_objects; index - 1 == num_objects
+		 * would already point one entry past the end of the table.
+		 */
+		if (index < 1 || index - 1 >= p->num_objects)
+			die("bad index in %s", __func__);
+		sha1 = p->sha1_table + (index - 1) * 20;
+	}
+
+	return sha1;
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 28/38] pack v4: code to load and prepare a pack dictionary table for use
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (26 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 27/38] pack v4: code to obtain a SHA1 from a sha1ref Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 29/38] pack v4: code to retrieve a name Nicolas Pitre
                   ` (11 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-parse.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/packv4-parse.c b/packv4-parse.c
index 299fc48..26894bc 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -28,3 +28,80 @@ const unsigned char *get_sha1ref(struct packed_git *p,
 
 	return sha1;
 }
+
+/*
+ * In-memory form of a pack v4 dictionary table.
+ *  data       - the inflated table: a sequence of entries, each made of
+ *               2 prefix bytes followed by a NUL-terminated string
+ *  nb_entries - number of entries found in data
+ *  offsets    - offsets[i] is the byte offset of entry i within data
+ */
+struct packv4_dict {
+	const unsigned char *data;
+	unsigned int nb_entries;
+	unsigned int offsets[FLEX_ARRAY];
+};
+
+/*
+ * Load the dictionary table stored at *offset in pack p.
+ *
+ * The on-disk form is a varint giving the inflated size, followed by the
+ * deflated table.  On success the newly allocated dictionary is returned
+ * and *offset is advanced to the first byte after the deflated data.
+ * On failure an error is reported, NULL is returned and *offset is left
+ * untouched.
+ */
+static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
+{
+	struct pack_window *w_curs = NULL;
+	off_t curpos = *offset;
+	unsigned long dict_size, avail;
+	unsigned char *src, *data;
+	const unsigned char *cp;
+	git_zstream stream;
+	struct packv4_dict *dict;
+	int nb_entries, i, st;
+
+	/* get uncompressed dictionary data size */
+	src = use_pack(p, &w_curs, curpos, &avail);
+	cp = src;
+	dict_size = decode_varint(&cp);
+	/* the smallest possible entry is 3 bytes: 2 prefix bytes and a NUL */
+	if (dict_size < 3) {
+		/* do not leave the window obtained above marked as in use */
+		unuse_pack(&w_curs);
+		error("bad dict size");
+		return NULL;
+	}
+	curpos += cp - src;
+
+	/*
+	 * NOTE(review): xmallocz() presumably allocates dict_size + 1 bytes
+	 * and NUL-terminates them; both the avail_out value below and the
+	 * strlen() scans further down rely on that -- confirm.
+	 */
+	data = xmallocz(dict_size);
+	memset(&stream, 0, sizeof(stream));
+	stream.next_out = data;
+	/* one spare byte so that an oversized stream shows up in total_out */
+	stream.avail_out = dict_size + 1;
+
+	git_inflate_init(&stream);
+	do {
+		src = use_pack(p, &w_curs, curpos, &stream.avail_in);
+		stream.next_in = src;
+		st = git_inflate(&stream, Z_FINISH);
+		curpos += stream.next_in - src;
+	} while ((st == Z_OK || st == Z_BUF_ERROR) && stream.avail_out);
+	git_inflate_end(&stream);
+	unuse_pack(&w_curs);
+	if (st != Z_STREAM_END || stream.total_out != dict_size) {
+		error("pack dictionary bad");
+		free(data);
+		return NULL;
+	}
+
+	/* count number of entries */
+	nb_entries = 0;
+	cp = data;
+	while (cp < data + dict_size - 3) {
+		cp += 2;  /* prefix bytes */
+		cp += strlen((const char *)cp);  /* entry string */
+		cp += 1;  /* terminating NUL */
+		nb_entries++;
+	}
+	/* the last entry must end exactly at the end of the table */
+	if (cp - data != dict_size) {
+		error("dict size mismatch");
+		free(data);
+		return NULL;
+	}
+
+	dict = xmalloc(sizeof(*dict) + nb_entries * sizeof(dict->offsets[0]));
+	dict->data = data;
+	dict->nb_entries = nb_entries;
+
+	/* second pass: record where each entry starts */
+	cp = data;
+	for (i = 0; i < nb_entries; i++) {
+		dict->offsets[i] = cp - data;
+		cp += 2;
+		cp += strlen((const char *)cp) + 1;
+	}
+
+	*offset = curpos;
+	return dict;
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 29/38] pack v4: code to retrieve a name
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (27 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 28/38] pack v4: code to load and prepare a pack dictionary table for use Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 30/38] pack v4: code to recreate a canonical commit object Nicolas Pitre
                   ` (10 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

The name dictionary is loaded if not already done.  We know it is
located right after the SHA1 table (20 bytes per object) which is
itself right after the 12-byte header.

Then the index is parsed from the input buffer and a pointer to the
corresponding entry is returned.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h        |  3 +++
 packv4-parse.c | 24 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/cache.h b/cache.h
index 59d9ba7..6ce327e 100644
--- a/cache.h
+++ b/cache.h
@@ -1015,6 +1015,8 @@ struct pack_window {
 	unsigned int inuse_cnt;
 };
 
+struct packv4_dict;
+
 extern struct packed_git {
 	struct packed_git *next;
 	struct pack_window *windows;
@@ -1027,6 +1029,7 @@ extern struct packed_git {
 	unsigned char *bad_object_sha1;
 	int version;
 	int index_version;
+	struct packv4_dict *name_dict;
 	time_t mtime;
 	int pack_fd;
 	unsigned pack_local:1,
diff --git a/packv4-parse.c b/packv4-parse.c
index 26894bc..074e107 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -105,3 +105,27 @@ static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 	*offset = curpos;
 	return dict;
 }
+
+/*
+ * Load the name dictionary of pack p into p->name_dict.
+ *
+ * The table is read from the offset that follows the 12-byte pack header
+ * and the SHA1 table (20 bytes per object).  Dies if the dictionary
+ * cannot be loaded.
+ */
+static void load_name_dict(struct packed_git *p)
+{
+	/*
+	 * Widen before multiplying: num_objects is a uint32_t, so the
+	 * product with 20 could wrap if computed in 32-bit arithmetic.
+	 */
+	off_t offset = 12 + (off_t)p->num_objects * 20;
+	struct packv4_dict *names = load_dict(p, &offset);
+	if (!names)
+		die("bad pack name dictionary in %s", p->pack_name);
+	p->name_dict = names;
+}
+
+/*
+ * Read a varint name index at *srcp (advancing *srcp) and return a
+ * pointer to the corresponding entry of the pack's name dictionary,
+ * loading that dictionary first if it is not in memory yet.
+ *
+ * The index is 0-based.  When it is out of range an error is reported
+ * and NULL is returned.
+ */
+const unsigned char *get_nameref(struct packed_git *p, const unsigned char **srcp)
+{
+	const struct packv4_dict *dict;
+	unsigned int idx;
+
+	if (!p->name_dict)
+		load_name_dict(p);
+	dict = p->name_dict;
+
+	idx = decode_varint(srcp);
+	if (idx < dict->nb_entries)
+		return dict->data + dict->offsets[idx];
+
+	error("%s: index overflow", __func__);
+	return NULL;
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 30/38] pack v4: code to recreate a canonical commit object
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (28 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 29/38] pack v4: code to retrieve a name Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 31/38] sha1_file.c: make use of decode_varint() Nicolas Pitre
                   ` (9 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Usage of snprintf() is possibly not the most efficient approach.
For example we could simply copy the needed strings and generate
the SHA1 hex strings directly into the destination buffer.  But
such optimizations may come later.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-parse.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/packv4-parse.c b/packv4-parse.c
index 074e107..bca1a97 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -129,3 +129,77 @@ const unsigned char *get_nameref(struct packed_git *p, const unsigned char **src
 	}
 	return p->name_dict->data + p->name_dict->offsets[index];
 }
+
+/*
+ * Recreate the canonical text of the pack v4 commit found at 'offset'
+ * in pack p.
+ *
+ * 'size' is the expected length of the canonical commit.  The encoded
+ * object holds, in order: the tree sha1ref, the parent count followed by
+ * one sha1ref per parent, the author name reference and time, the
+ * committer name reference and time, then the deflated remainder of the
+ * commit.
+ *
+ * Returns a newly allocated buffer that the caller must free, or NULL
+ * when a name reference is invalid or when the deflated part is corrupt
+ * or does not fill the buffer exactly.  Dies when a header line does not
+ * fit in the space left.
+ */
+void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
+		     off_t offset, unsigned long size)
+{
+	unsigned long avail;
+	git_zstream stream;
+	int len, st;
+	unsigned int nb_parents;
+	unsigned char *dst, *dcp;
+	const unsigned char *src, *scp, *sha1, *name;
+	unsigned long time;
+	int16_t tz;
+
+	dst = xmallocz(size);
+	dcp = dst;
+
+	src = use_pack(p, w_curs, offset, &avail);
+	scp = src;
+
+	sha1 = get_sha1ref(p, &scp);
+	len = snprintf((char *)dcp, size, "tree %s\n", sha1_to_hex(sha1));
+	/* without this check a too small 'size' makes "size -= len" wrap */
+	if (len >= size)
+		die("overflow in %s", __func__);
+	dcp += len;
+	size -= len;
+
+	nb_parents = decode_varint(&scp);
+	while (nb_parents--) {
+		sha1 = get_sha1ref(p, &scp);
+		len = snprintf((char *)dcp, size, "parent %s\n", sha1_to_hex(sha1));
+		if (len >= size)
+			die("overflow in %s", __func__);
+		dcp += len;
+		size -= len;
+	}
+
+	name = get_nameref(p, &scp);
+	if (!name) {
+		/* get_nameref() has already reported the error */
+		free(dst);
+		return NULL;
+	}
+	/* the 2 prefix bytes of a name entry hold the timezone, MSB first */
+	tz = (name[0] << 8) | name[1];
+	time = decode_varint(&scp);
+	len = snprintf((char *)dcp, size, "author %s %lu %+05d\n", name+2, time, tz);
+	if (len >= size)
+		die("overflow in %s", __func__);
+	dcp += len;
+	size -= len;
+
+	name = get_nameref(p, &scp);
+	if (!name) {
+		free(dst);
+		return NULL;
+	}
+	tz = (name[0] << 8) | name[1];
+	time = decode_varint(&scp);
+	len = snprintf((char *)dcp, size, "committer %s %lu %+05d\n", name+2, time, tz);
+	if (len >= size)
+		die("overflow in %s", __func__);
+	dcp += len;
+	size -= len;
+
+	/* everything parsed so far must have come from the current window */
+	if (scp - src > avail)
+		die("overflow in %s", __func__);
+	offset += scp - src;
+
+	/* the rest of the commit is deflated; 'size' is what is left to fill */
+	memset(&stream, 0, sizeof(stream));
+	stream.next_out = dcp;
+	stream.avail_out = size + 1;
+	git_inflate_init(&stream);
+	do {
+		src = use_pack(p, w_curs, offset, &stream.avail_in);
+		stream.next_in = (unsigned char *)src;
+		st = git_inflate(&stream, Z_FINISH);
+		offset += stream.next_in - src;
+	} while ((st == Z_OK || st == Z_BUF_ERROR) && stream.avail_out);
+	git_inflate_end(&stream);
+	if (st != Z_STREAM_END || stream.total_out != size) {
+		free(dst);
+		return NULL;
+	}
+
+	return dst;
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 31/38] sha1_file.c: make use of decode_varint()
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (29 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 30/38] pack v4: code to recreate a canonical commit object Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  7:35   ` SZEDER Gábor
  2013-09-05  6:19 ` [PATCH 32/38] pack v4: parse delta base reference Nicolas Pitre
                   ` (8 subsequent siblings)
  39 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

... replacing the equivalent open coded loop.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 sha1_file.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index a298933..67eb903 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1687,20 +1687,12 @@ static off_t get_delta_base(struct packed_git *p,
 	 * is stupid, as then a REF_DELTA would be smaller to store.
 	 */
 	if (type == OBJ_OFS_DELTA) {
-		unsigned used = 0;
-		unsigned char c = base_info[used++];
-		base_offset = c & 127;
-		while (c & 128) {
-			base_offset += 1;
-			if (!base_offset || MSB(base_offset, 7))
-				return 0;  /* overflow */
-			c = base_info[used++];
-			base_offset = (base_offset << 7) + (c & 127);
-		}
+		const unsigned char *cp = base_info;
+		base_offset = decode_varint(&cp);
 		base_offset = delta_obj_offset - base_offset;
 		if (base_offset <= 0 || base_offset >= delta_obj_offset)
 			return 0;  /* out of bound */
-		*curpos += used;
+		*curpos += cp - base_info;
 	} else if (type == OBJ_REF_DELTA) {
 		/* The base entry _must_ be in the same pack */
 		base_offset = find_pack_entry_one(base_info, p);
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 32/38] pack v4: parse delta base reference
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (30 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 31/38] sha1_file.c: make use of decode_varint() Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 33/38] pack v4: we can read commit objects now Nicolas Pitre
                   ` (7 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

There is only one type of delta with pack v4.  The base reference
encoding already handles either an offset (via the pack index) or a
literal SHA1.

We assume in the literal SHA1 case that the object lives in the same
pack, just like with previous pack versions.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 sha1_file.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/sha1_file.c b/sha1_file.c
index 67eb903..f3bfa28 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1686,7 +1686,19 @@ static off_t get_delta_base(struct packed_git *p,
 	 * that is assured.  An OFS_DELTA longer than the hash size
 	 * is stupid, as then a REF_DELTA would be smaller to store.
 	 */
-	if (type == OBJ_OFS_DELTA) {
+	if (p->version >= 4) {
+		if (base_info[0] != 0) {
+			const unsigned char *cp = base_info;
+			unsigned int base_index = decode_varint(&cp);
+			if (!base_index || base_index - 1 >= p->num_objects)
+				return 0;  /* out of bounds */
+			*curpos += cp - base_info;
+			base_offset = nth_packed_object_offset(p, base_index - 1);
+		} else {
+			base_offset = find_pack_entry_one(base_info+1, p);
+			*curpos += 21;
+		}
+	} else if (type == OBJ_OFS_DELTA) {
 		const unsigned char *cp = base_info;
 		base_offset = decode_varint(&cp);
 		base_offset = delta_obj_offset - base_offset;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 33/38] pack v4: we can read commit objects now
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (31 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 32/38] pack v4: parse delta base reference Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 34/38] pack v4: code to retrieve a path component Nicolas Pitre
                   ` (6 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 Makefile       |  1 +
 packv4-parse.c |  1 +
 packv4-parse.h |  7 +++++++
 sha1_file.c    | 10 ++++++++++
 4 files changed, 19 insertions(+)
 create mode 100644 packv4-parse.h

diff --git a/Makefile b/Makefile
index ba6cafc..22fc276 100644
--- a/Makefile
+++ b/Makefile
@@ -702,6 +702,7 @@ LIB_H += notes.h
 LIB_H += object.h
 LIB_H += pack-revindex.h
 LIB_H += pack.h
+LIB_H += packv4-parse.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
 LIB_H += pathspec.h
diff --git a/packv4-parse.c b/packv4-parse.c
index bca1a97..431f47e 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -9,6 +9,7 @@
  */
 
 #include "cache.h"
+#include "packv4-parse.h"
 #include "varint.h"
 
 const unsigned char *get_sha1ref(struct packed_git *p,
diff --git a/packv4-parse.h b/packv4-parse.h
new file mode 100644
index 0000000..40aa75a
--- /dev/null
+++ b/packv4-parse.h
@@ -0,0 +1,7 @@
+#ifndef PACKV4_PARSE_H
+#define PACKV4_PARSE_H
+
+void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
+		     off_t offset, unsigned long size);
+
+#endif
diff --git a/sha1_file.c b/sha1_file.c
index f3bfa28..b57d9f8 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -19,6 +19,7 @@
 #include "tree-walk.h"
 #include "refs.h"
 #include "pack-revindex.h"
+#include "packv4-parse.h"
 #include "sha1-lookup.h"
 #include "bulk-checkin.h"
 #include "streaming.h"
@@ -2172,6 +2173,15 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 		break;
 	case OBJ_COMMIT:
 	case OBJ_TREE:
+		if (p->version >= 4 && !base_from_cache) {
+			if (type == OBJ_COMMIT) {
+				data = pv4_get_commit(p, &w_curs, curpos, size);
+			} else {
+				die("no pack v4 tree parsing yet");
+			}
+			break;
+		}
+		/* fall through */
 	case OBJ_BLOB:
 	case OBJ_TAG:
 		if (!base_from_cache)
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 34/38] pack v4: code to retrieve a path component
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (32 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 33/38] pack v4: we can read commit objects now Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 35/38] pack v4: decode tree objects Nicolas Pitre
                   ` (5 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Because the path dictionary table is located right after the name
dictionary table, we currently need to load the latter to find the
former.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h        |  2 ++
 packv4-parse.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/cache.h b/cache.h
index 6ce327e..5f2147a 100644
--- a/cache.h
+++ b/cache.h
@@ -1030,6 +1030,8 @@ extern struct packed_git {
 	int version;
 	int index_version;
 	struct packv4_dict *name_dict;
+	off_t name_dict_end;
+	struct packv4_dict *path_dict;
 	time_t mtime;
 	int pack_fd;
 	unsigned pack_local:1,
diff --git a/packv4-parse.c b/packv4-parse.c
index 431f47e..b80b73e 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -114,6 +114,7 @@ static void load_name_dict(struct packed_git *p)
 	if (!names)
 		die("bad pack name dictionary in %s", p->pack_name);
 	p->name_dict = names;
+	p->name_dict_end = offset;
 }
 
 const unsigned char *get_nameref(struct packed_git *p, const unsigned char **srcp)
@@ -131,6 +132,41 @@ const unsigned char *get_nameref(struct packed_git *p, const unsigned char **src
 	return p->name_dict->data + p->name_dict->offsets[index];
 }
 
+/*
+ * Load the path dictionary of pack p into p->path_dict.
+ * Dies if the dictionary cannot be loaded.
+ */
+static void load_path_dict(struct packed_git *p)
+{
+	struct packv4_dict *dict;
+	off_t pos;
+
+	/*
+	 * The path table starts where the name table ends, and that
+	 * position is only known once the name dictionary was loaded.
+	 */
+	if (!p->name_dict)
+		load_name_dict(p);
+	pos = p->name_dict_end;
+
+	dict = load_dict(p, &pos);
+	if (!dict)
+		die("bad pack path dictionary in %s", p->pack_name);
+	p->path_dict = dict;
+}
+
+/*
+ * Read a varint path reference at *srcp (advancing *srcp) and return a
+ * pointer to the corresponding entry of the pack's path dictionary,
+ * loading that dictionary first if it is not in memory yet.
+ *
+ * References are 1-based.  A reference of 0 or one past the last entry
+ * is reported as an error and NULL is returned.
+ */
+const unsigned char *get_pathref(struct packed_git *p, const unsigned char **srcp)
+{
+	const struct packv4_dict *dict;
+	unsigned int ref;
+
+	if (!p->path_dict)
+		load_path_dict(p);
+	dict = p->path_dict;
+
+	ref = decode_varint(srcp);
+	if (ref >= 1 && ref <= dict->nb_entries)
+		return dict->data + dict->offsets[ref - 1];
+
+	error("%s: index overflow", __func__);
+	return NULL;
+}
+
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 		     off_t offset, unsigned long size)
 {
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 35/38] pack v4: decode tree objects
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (33 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 34/38] pack v4: code to retrieve a path component Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:19 ` [PATCH 36/38] pack v4: get " Nicolas Pitre
                   ` (4 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

For now we recreate the whole tree object in its canonical form.

Eventually, the core code should grow some ability to walk packv4 tree
entries directly which would be way more efficient.  Not only would that
avoid double tree entry parsing, but the pack v4 encoding allows for
getting at child objects without going through the SHA1 search.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-parse.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 131 insertions(+), 6 deletions(-)

diff --git a/packv4-parse.c b/packv4-parse.c
index b80b73e..04eab46 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -151,19 +151,15 @@ static void load_path_dict(struct packed_git *p)
 	p->path_dict = paths;
 }
 
-const unsigned char *get_pathref(struct packed_git *p, const unsigned char **srcp)
+const unsigned char *get_pathref(struct packed_git *p, unsigned int index)
 {
-	unsigned int index;
-
 	if (!p->path_dict)
 		load_path_dict(p);
 
-	index = decode_varint(srcp);
-	if (index < 1 || index - 1 >= p->path_dict->nb_entries) {
+	if (index >= p->path_dict->nb_entries) {
 		error("%s: index overflow", __func__);
 		return NULL;
 	}
-	index -= 1;
 	return p->path_dict->data + p->path_dict->offsets[index];
 }
 
@@ -240,3 +236,132 @@ void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 
 	return dst;
 }
+
+/*
+ * Expand 'count' entries of the pack v4 tree stored at 'offset' in pack
+ * p into canonical tree entries ("<mode> <path>\0<20-byte SHA1>").
+ *
+ *  start  - number of leading entries of that tree to skip first
+ *  count  - number of entries to produce after the skipped ones
+ *  dstp   - output cursor, advanced past what was written
+ *  sizep  - space left at *dstp, decreased by what was written
+ *  hdr    - non-zero when 'offset' designates the object header, which
+ *           is then skipped and checked to describe a tree
+ *
+ * Copy sequences are resolved by recursing into the tree they copy from.
+ * Returns 0 on success and -1 on malformed data or lack of output space.
+ */
+static int decode_entries(struct packed_git *p, struct pack_window **w_curs,
+			  off_t offset, unsigned int start, unsigned int count,
+			  unsigned char **dstp, unsigned long *sizep, int hdr)
+{
+	unsigned long avail;
+	unsigned int nb_entries;
+	const unsigned char *src, *scp;
+	off_t copy_objoffset = 0;
+
+	src = use_pack(p, w_curs, offset, &avail);
+	scp = src;
+
+	if (hdr) {
+		/* we need to skip over the object header */
+		while (*scp & 128)
+			if (++scp - src >= avail - 20)
+				return -1;
+		/* let's still make sure this is actually a tree */
+		if ((*scp++ & 0xf) != OBJ_TREE)
+			return -1;
+	}
+
+	nb_entries = decode_varint(&scp);
+	/* the requested range must lie within the entries of this tree */
+	if (scp == src || start > nb_entries || count > nb_entries - start)
+		return -1;
+	offset += scp - src;
+	avail -= scp - src;
+	src = scp;
+
+	while (count) {
+		unsigned int what;
+
+		if (avail < 20) {
+			src = use_pack(p, w_curs, offset, &avail);
+			if (avail < 20)
+				return -1;
+		}
+		scp = src;
+
+		/* LSB clear: path index of a single entry; set: copy sequence */
+		what = decode_varint(&scp);
+		if (scp == src)
+			return -1;
+
+		if (!(what & 1) && start != 0) {
+			/*
+			 * This is a single entry and we have to skip it.
+			 * The path index was parsed and is in 'what'.
+			 * Skip over the SHA1 reference, which is either a
+			 * 0 byte followed by a literal 20-byte SHA1 or a
+			 * varint index (see get_sha1ref()).
+			 */
+			if (!*scp)
+				scp += 1 + 20;
+			else
+				while (*scp++ & 128)
+					; /* nothing */
+			start--;
+		} else if (!(what & 1) && start == 0) {
+			/*
+			 * This is an actual tree entry to recreate.
+			 */
+			const unsigned char *path, *sha1;
+			unsigned mode;
+			int len;
+
+			path = get_pathref(p, what >> 1);
+			sha1 = get_sha1ref(p, &scp);
+			if (!path || !sha1)
+				return -1;
+			/* the 2 prefix bytes of a path entry hold the mode */
+			mode = (path[0] << 8) | path[1];
+			len = snprintf((char *)*dstp, *sizep, "%o %s%c",
+					   mode, path+2, '\0');
+			if (len + 20 > *sizep)
+				return -1;
+			hashcpy(*dstp + len, sha1);
+			*dstp += len + 20;
+			*sizep -= len + 20;
+			count--;
+		} else if (what & 1) {
+			/*
+			 * Copy from another tree object.
+			 */
+			unsigned int copy_start, copy_count;
+
+			copy_start = what >> 1;
+			copy_count = decode_varint(&scp);
+			if (!copy_count)
+				return -1;
+
+			/*
+			 * The LSB of copy_count is a flag indicating if
+			 * a third value is provided to specify the source
+			 * object.  This may be omitted when it doesn't
+			 * change, but has to be specified at least for the
+			 * first copy sequence.
+			 */
+			if (copy_count & 1) {
+				unsigned index = decode_varint(&scp);
+				if (!index)  /* thin pack */
+					return -1;
+				/* the index is 1-based and must be in this pack */
+				if (index - 1 >= p->num_objects)
+					return -1;  /* out of bounds */
+				copy_objoffset =
+					nth_packed_object_offset(p, index - 1);
+			}
+			if (!copy_objoffset)
+				return -1;
+			copy_count >>= 1;
+
+			if (start >= copy_count) {
+				/* the whole sequence is within the skipped part */
+				start -= copy_count;
+			} else {
+				int ret;
+				copy_count -= start;
+				copy_start += start;
+				start = 0;
+				if (copy_count > count)
+					copy_count = count;
+				count -= copy_count;
+				ret = decode_entries(p, w_curs,
+					copy_objoffset, copy_start, copy_count,
+					dstp, sizep, 1);
+				if (ret)
+					return ret;
+				/* force pack window readjustment */
+				avail = scp - src;
+			}
+		}
+
+		offset += scp - src;
+		avail -= scp - src;
+		src = scp;
+	}
+
+	return 0;
+}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 36/38] pack v4: get tree objects
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (34 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 35/38] pack v4: decode tree objects Nicolas Pitre
@ 2013-09-05  6:19 ` Nicolas Pitre
  2013-09-05  6:20 ` [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes Nicolas Pitre
                   ` (3 subsequent siblings)
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:19 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-parse.c | 25 +++++++++++++++++++++++++
 packv4-parse.h |  2 ++
 sha1_file.c    |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/packv4-parse.c b/packv4-parse.c
index 04eab46..4c218d2 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -365,3 +365,28 @@ static int decode_entries(struct packed_git *p, struct pack_window **w_curs,
 
 	return 0;
 }
+
+/*
+ * Recreate the canonical form of the pack v4 tree found at 'offset' in
+ * pack p.  'size' is the expected length of the canonical tree.
+ *
+ * Returns a newly allocated buffer that the caller must free, or NULL
+ * when the entry count cannot be read, when decoding fails, or when the
+ * decoded entries do not add up to exactly 'size' bytes.
+ */
+void *pv4_get_tree(struct packed_git *p, struct pack_window **w_curs,
+		   off_t offset, unsigned long size)
+{
+	unsigned long window_len;
+	unsigned long remaining = size;
+	const unsigned char *base, *cursor;
+	unsigned char *tree, *out;
+	unsigned int total;
+
+	/* peek at the entry count so that all entries can be requested */
+	base = use_pack(p, w_curs, offset, &window_len);
+	cursor = base;
+	total = decode_varint(&cursor);
+	if (cursor == base)
+		return NULL;
+
+	tree = xmallocz(size);
+	out = tree;
+	if (decode_entries(p, w_curs, offset, 0, total, &out, &remaining, 0) >= 0 &&
+	    !remaining)
+		return tree;
+
+	free(tree);
+	return NULL;
+}
diff --git a/packv4-parse.h b/packv4-parse.h
index 40aa75a..5f9d809 100644
--- a/packv4-parse.h
+++ b/packv4-parse.h
@@ -3,5 +3,7 @@
 
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 		     off_t offset, unsigned long size);
+void *pv4_get_tree(struct packed_git *p, struct pack_window **w_curs,
+		   off_t offset, unsigned long size);
 
 #endif
diff --git a/sha1_file.c b/sha1_file.c
index b57d9f8..79e1293 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2177,7 +2177,7 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 			if (type == OBJ_COMMIT) {
 				data = pv4_get_commit(p, &w_curs, curpos, size);
 			} else {
-				die("no pack v4 tree parsing yet");
+				data = pv4_get_tree(p, &w_curs, curpos, size);
 			}
 			break;
 		}
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (35 preceding siblings ...)
  2013-09-05  6:19 ` [PATCH 36/38] pack v4: get " Nicolas Pitre
@ 2013-09-05  6:20 ` Nicolas Pitre
  2013-09-05 19:02   ` Nicolas Pitre
  2013-09-05  6:20 ` [PATCH 38/38] packv4-create: add a command line argument to limit tree copy sequences Nicolas Pitre
                   ` (2 subsequent siblings)
  39 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:20 UTC (permalink / raw)
  To: git

If the path or name index is zero, this means the entry data is to be
found inline rather than being located in the dictionary table. This is
there to allow easy completion of thin packs without having to add new
table entries which would have required a full rewrite of the pack data.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c |  6 +++---
 packv4-parse.c  | 28 ++++++++++++++++++++++------
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index fd16222..9d6ffc0 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -343,7 +343,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
 	if (index < 0)
 		goto bad_dict;
-	out += encode_varint(index, out);
+	out += encode_varint(index + 1, out);
 	time = strtoul(end, &end, 10);
 	if (!end || end[0] != ' ' || end[6] != '\n')
 		goto bad_data;
@@ -361,7 +361,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
 	if (index < 0)
 		goto bad_dict;
-	out += encode_varint(index, out);
+	out += encode_varint(index + 1, out);
 	time = strtoul(end, &end, 10);
 	if (!end || end[0] != ' ' || end[6] != '\n')
 		goto bad_data;
@@ -561,7 +561,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			free(buffer);
 			return NULL;
 		}
-		out += encode_varint(index << 1, out);
+		out += encode_varint((index + 1) << 1, out);
 		out += encode_sha1ref(name_entry.sha1, out);
 	}
 
diff --git a/packv4-parse.c b/packv4-parse.c
index 4c218d2..6db4ed3 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -125,11 +125,19 @@ const unsigned char *get_nameref(struct packed_git *p, const unsigned char **src
 		load_name_dict(p);
 
 	index = decode_varint(srcp);
-	if (index >= p->name_dict->nb_entries) {
+
+	if (!index) {
+		/* the entry data is inline */
+		const unsigned char *data = *srcp;
+		*srcp += 2 + strlen((const char *)*srcp + 2) + 1;
+		return data;
+	}
+
+	if (index - 1 >= p->name_dict->nb_entries) {
 		error("%s: index overflow", __func__);
 		return NULL;
 	}
-	return p->name_dict->data + p->name_dict->offsets[index];
+	return p->name_dict->data + p->name_dict->offsets[index - 1];
 }
 
 static void load_path_dict(struct packed_git *p)
@@ -151,16 +159,24 @@ static void load_path_dict(struct packed_git *p)
 	p->path_dict = paths;
 }
 
-const unsigned char *get_pathref(struct packed_git *p, unsigned int index)
+const unsigned char *get_pathref(struct packed_git *p, unsigned int index,
+				 const unsigned char **srcp)
 {
 	if (!p->path_dict)
 		load_path_dict(p);
 
-	if (index >= p->path_dict->nb_entries) {
+	if (!index) {
+		/* the entry data is inline */
+		const unsigned char *data = *srcp;
+		*srcp += 2 + strlen((const char *)*srcp + 2) + 1;
+		return data;
+	}
+
+	if (index - 1 >= p->path_dict->nb_entries) {
 		error("%s: index overflow", __func__);
 		return NULL;
 	}
-	return p->path_dict->data + p->path_dict->offsets[index];
+	return p->path_dict->data + p->path_dict->offsets[index - 1];
 }
 
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
@@ -296,7 +312,7 @@ static int decode_entries(struct packed_git *p, struct pack_window **w_curs,
 			unsigned mode;
 			int len;
 
-			path = get_pathref(p, what >> 1);
+			path = get_pathref(p, what >> 1, &scp);
 			sha1 = get_sha1ref(p, &scp);
 			if (!path || !sha1)
 				return -1;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 38/38] packv4-create: add a command line argument to limit tree copy sequences
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (36 preceding siblings ...)
  2013-09-05  6:20 ` [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes Nicolas Pitre
@ 2013-09-05  6:20 ` Nicolas Pitre
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  39 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05  6:20 UTC (permalink / raw)
  To: git

Because there is no delta object cache for tree objects yet, walking
tree entries may result in a lot of recursion.

Let's add --min-tree-copy=N where N is the minimum number of copied
entries in a single copy sequence allowed for encoding tree deltas.
The default is 1. Specifying 0 disables tree deltas entirely.

This allows for experiments with the delta width to see its influence
on pack size vs runtime access cost.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 9d6ffc0..34dcebf 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -19,6 +19,7 @@
 
 static int pack_compression_seen;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
+static int min_tree_copy = 1;
 
 struct data_entry {
 	unsigned offset;
@@ -441,7 +442,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 	if (!size)
 		return NULL;
 
-	if (!delta_size)
+	if (!delta_size || !min_tree_copy)
 		delta = NULL;
 
 	/*
@@ -530,7 +531,6 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			cp += encode_varint(copy_count, cp);
 			if (first_delta)
 				cp += encode_sha1ref(delta_sha1, cp);
-			copy_count = 0;
 
 			/*
 			 * Now let's make sure this is going to take less
@@ -538,12 +538,14 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			 * created in parallel.  If so we dump the copy
 			 * sequence over those entries in the output buffer.
 			 */
-			if (cp - copy_buf < out - &buffer[copy_pos]) {
+			if (copy_count >= min_tree_copy &&
+			    cp - copy_buf < out - &buffer[copy_pos]) {
 				out = buffer + copy_pos;
 				memcpy(out, copy_buf, cp - copy_buf);
 				out += cp - copy_buf;
 				first_delta = 0;
 			}
+			copy_count = 0;
 		}
 
 		if (end - out < 48) {
@@ -574,7 +576,8 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 		cp += encode_varint(copy_count, cp);
 		if (first_delta)
 			cp += encode_sha1ref(delta_sha1, cp);
-		if (cp - copy_buf < out - &buffer[copy_pos]) {
+		if (copy_count >= min_tree_copy &&
+		    cp - copy_buf < out - &buffer[copy_pos]) {
 			out = buffer + copy_pos;
 			memcpy(out, copy_buf, cp - copy_buf);
 			out += cp - copy_buf;
@@ -1078,14 +1081,24 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 
 int main(int argc, char *argv[])
 {
-	if (argc != 3) {
-		fprintf(stderr, "Usage: %s <src_packfile> <dst_packfile>\n", argv[0]);
+	char *src_pack, *dst_pack;
+
+	if (argc == 3) {
+		src_pack = argv[1];
+		dst_pack = argv[2];
+	} else if (argc == 4 && !prefixcmp(argv[1], "--min-tree-copy=")) {
+		min_tree_copy = atoi(argv[1] + strlen("--min-tree-copy="));
+		src_pack = argv[2];
+		dst_pack = argv[3];
+	} else {
+		fprintf(stderr, "Usage: %s [--min-tree-copy=<n>] <src_packfile> <dst_packfile>\n", argv[0]);
 		exit(1);
 	}
+
 	git_config(git_pack_config, NULL);
 	if (!pack_compression_seen && core_compression_seen)
 		pack_compression_level = core_compression_level;
-	process_one_pack(argv[1], argv[2]);
+	process_one_pack(src_pack, dst_pack);
 	if (0)
 		dict_dump();
 	return 0;
-- 
1.8.4.38.g317e65b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* Re: [PATCH 31/38] sha1_file.c: make use of decode_varint()
  2013-09-05  6:19 ` [PATCH 31/38] sha1_file.c: make use of decode_varint() Nicolas Pitre
@ 2013-09-05  7:35   ` SZEDER Gábor
  0 siblings, 0 replies; 124+ messages in thread
From: SZEDER Gábor @ 2013-09-05  7:35 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, git

On Thu, Sep 05, 2013 at 02:19:54AM -0400, Nicolas Pitre wrote:
> ... replacing the equivalent open coded loop.
> 
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  sha1_file.c | 14 +++-----------
>  1 file changed, 3 insertions(+), 11 deletions(-)
> 
> diff --git a/sha1_file.c b/sha1_file.c
> index a298933..67eb903 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -1687,20 +1687,12 @@ static off_t get_delta_base(struct packed_git *p,
>  	 * is stupid, as then a REF_DELTA would be smaller to store.
>  	 */
>  	if (type == OBJ_OFS_DELTA) {
> -		unsigned used = 0;
> -		unsigned char c = base_info[used++];
> -		base_offset = c & 127;
> -		while (c & 128) {
> -			base_offset += 1;
> -			if (!base_offset || MSB(base_offset, 7))
> -				return 0;  /* overflow */
> -			c = base_info[used++];
> -			base_offset = (base_offset << 7) + (c & 127);
> -		}
> +		const unsigned char *cp = base_info;
> +		base_offset = decode_varint(&cp);
>  		base_offset = delta_obj_offset - base_offset;
>  		if (base_offset <= 0 || base_offset >= delta_obj_offset)
>  			return 0;  /* out of bound */
> -		*curpos += used;
> +		*curpos += cp - base_info;
>  	} else if (type == OBJ_REF_DELTA) {
>  		/* The base entry _must_ be in the same pack */
>  		base_offset = find_pack_entry_one(base_info, p);
> -- 
> 1.8.4.38.g317e65b

This patch seems to be a cleanup independent of pack v4; it applies
cleanly on master and passes all tests by itself.

Best,
Gábor

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 05/38] pack v4: add commit object parsing
  2013-09-05  6:19 ` [PATCH 05/38] pack v4: add commit object parsing Nicolas Pitre
@ 2013-09-05 10:30   ` SZEDER Gábor
  2013-09-05 17:30     ` Nicolas Pitre
  0 siblings, 1 reply; 124+ messages in thread
From: SZEDER Gábor @ 2013-09-05 10:30 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Hi,


On Thu, Sep 05, 2013 at 02:19:28AM -0400, Nicolas Pitre wrote:
> Let's create another dictionary table to hold the author and committer
> entries.  We use the same table format used for tree entries where the
> 16 bit data prefix is conveniently used to store the timezone value.
> 
> In order to copy straight from a commit object buffer, dict_add_entry()
> is modified to get the string length as the provided string pointer is
> not always null terminated.
> 
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  packv4-create.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 89 insertions(+), 9 deletions(-)
> 
> diff --git a/packv4-create.c b/packv4-create.c
> index eccd9fc..5c08871 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -1,5 +1,5 @@
>  /*
> - * packv4-create.c: management of dictionary tables used in pack v4
> + * packv4-create.c: creation of dictionary tables and objects used in pack v4
>   *
>   * (C) Nicolas Pitre <nico@fluxnic.net>
>   *
> @@ -80,9 +80,9 @@ static void rehash_entries(struct dict_table *t)
>  	}
>  }
>  
> -int dict_add_entry(struct dict_table *t, int val, const char *str)
> +int dict_add_entry(struct dict_table *t, int val, const char *str, int str_len)
>  {
> -	int i, val_len = 2, str_len = strlen(str) + 1;
> +	int i, val_len = 2;
>  
>  	if (t->ptr + val_len + str_len > t->size) {

We need a +1 here on the left side, i.e.

        if (t->ptr + val_len + str_len + 1 > t->size) {

The str_len variable accounted for the terminating null character
before, but this patch removes str_len = strlen(str) + 1; above, and
callers specify the length of str without the terminating null in
str_len.  Thus it can lead to memory corruption, when the new entry
happens to end at 't->ptr + val_len + str_len' and the line added in
the next hunk writes the terminating null beyond the end of the
buffer.  I couldn't create a v4 pack from a current linux repo because
of this; either glibc detected something or 'git packv4-create'
crashed.

Sidenote: couldn't we call the 'ptr' field something else, like
end_offset or end_idx?  It took me some headscratching to figure out
why it is OK to compare a pointer to an integer above, or use a
pointer without dereferencing as an index into an array below (because
ptr is, well, not a pointer after all).

>  		t->size = (t->size + val_len + str_len + 1024) * 3 / 2;
> @@ -92,6 +92,7 @@ int dict_add_entry(struct dict_table *t, int val, const char *str)
>  	t->data[t->ptr] = val >> 8;
>  	t->data[t->ptr + 1] = val;
>  	memcpy(t->data + t->ptr + val_len, str, str_len);
> +	t->data[t->ptr + val_len + str_len] = 0;
>  
>  	i = (t->nb_entries) ?
>  		locate_entry(t, t->data + t->ptr, val_len + str_len) : -1;
> @@ -107,7 +108,7 @@ int dict_add_entry(struct dict_table *t, int val, const char *str)
>  	t->entry[t->nb_entries].offset = t->ptr;
>  	t->entry[t->nb_entries].size = val_len + str_len;
>  	t->entry[t->nb_entries].hits = 1;
> -	t->ptr += val_len + str_len;
> +	t->ptr += val_len + str_len + 1;

Good.


Best,
Gábor


>  	t->nb_entries++;
>  
>  	if (t->hash_size * 3 <= t->nb_entries * 4)
> @@ -135,8 +136,73 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
>  	rehash_entries(t);
>  }
>  
> +static struct dict_table *commit_name_table;
>  static struct dict_table *tree_path_table;
>  
> +/*
> + * Parse the author/committer line from a canonical commit object.
> + * The 'from' argument points right after the "author " or "committer "
> + * string.  The time zone is parsed and stored in *tz_val.  The returned
> + * pointer is right after the end of the email address which is also just
> + * before the time value, or NULL if a parsing error is encountered.
> + */
> +static char *get_nameend_and_tz(char *from, int *tz_val)
> +{
> +	char *end, *tz;
> +
> +	tz = strchr(from, '\n');
> +	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
> +	if (!tz || tz - from < 13)
> +		return NULL;
> +	tz -= 4;
> +	end = tz - 4;
> +	while (end - from > 5 && *end != ' ')
> +		end--;
> +	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
> +		return NULL;
> +	*tz_val = (tz[0] - '0') * 1000 +
> +		  (tz[1] - '0') * 100 +
> +		  (tz[2] - '0') * 10 +
> +		  (tz[3] - '0');
> +	switch (tz[-1]) {
> +	default:	return NULL;
> +	case '+':	break;
> +	case '-':	*tz_val = -*tz_val;
> +	}
> +	return end;
> +}
> +
> +static int add_commit_dict_entries(void *buf, unsigned long size)
> +{
> +	char *name, *end = NULL;
> +	int tz_val;
> +
> +	if (!commit_name_table)
> +		commit_name_table = create_dict_table();
> +
> +	/* parse and add author info */
> +	name = strstr(buf, "\nauthor ");
> +	if (name) {
> +		name += 8;
> +		end = get_nameend_and_tz(name, &tz_val);
> +	}
> +	if (!name || !end)
> +		return -1;
> +	dict_add_entry(commit_name_table, tz_val, name, end - name);
> +
> +	/* parse and add committer info */
> +	name = strstr(end, "\ncommitter ");
> +	if (name) {
> +	       name += 11;
> +	       end = get_nameend_and_tz(name, &tz_val);
> +	}
> +	if (!name || !end)
> +		return -1;
> +	dict_add_entry(commit_name_table, tz_val, name, end - name);
> +
> +	return 0;
> +}
> +
>  static int add_tree_dict_entries(void *buf, unsigned long size)
>  {
>  	struct tree_desc desc;
> @@ -146,13 +212,16 @@ static int add_tree_dict_entries(void *buf, unsigned long size)
>  		tree_path_table = create_dict_table();
>  
>  	init_tree_desc(&desc, buf, size);
> -	while (tree_entry(&desc, &name_entry))
> +	while (tree_entry(&desc, &name_entry)) {
> +		int pathlen = tree_entry_len(&name_entry);
>  		dict_add_entry(tree_path_table, name_entry.mode,
> -			       name_entry.path);
> +				name_entry.path, pathlen);
> +	}
> +
>  	return 0;
>  }
>  
> -void dict_dump(struct dict_table *t)
> +void dump_dict_table(struct dict_table *t)
>  {
>  	int i;
>  
> @@ -169,6 +238,12 @@ void dict_dump(struct dict_table *t)
>  	}
>  }
>  
> +static void dict_dump(void)
> +{
> +	dump_dict_table(commit_name_table);
> +	dump_dict_table(tree_path_table);
> +}
> +
>  struct idx_entry
>  {
>  	off_t                offset;
> @@ -205,6 +280,7 @@ static int create_pack_dictionaries(struct packed_git *p)
>  		enum object_type type;
>  		unsigned long size;
>  		struct object_info oi = {};
> +		int (*add_dict_entries)(void *, unsigned long);
>  
>  		oi.typep = &type;
>  		oi.sizep = &size;
> @@ -213,7 +289,11 @@ static int create_pack_dictionaries(struct packed_git *p)
>  			    sha1_to_hex(objects[i].sha1), p->pack_name);
>  
>  		switch (type) {
> +		case OBJ_COMMIT:
> +			add_dict_entries = add_commit_dict_entries;
> +			break;
>  		case OBJ_TREE:
> +			add_dict_entries = add_tree_dict_entries;
>  			break;
>  		default:
>  			continue;
> @@ -225,7 +305,7 @@ static int create_pack_dictionaries(struct packed_git *p)
>  		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
>  			die("packed %s from %s is corrupt",
>  			    sha1_to_hex(objects[i].sha1), p->pack_name);
> -		if (add_tree_dict_entries(data, size) < 0)
> +		if (add_dict_entries(data, size) < 0)
>  			die("can't process %s object %s",
>  				typename(type), sha1_to_hex(objects[i].sha1));
>  		free(data);
> @@ -285,6 +365,6 @@ int main(int argc, char *argv[])
>  		exit(1);
>  	}
>  	process_one_pack(argv[1]);
> -	dict_dump(tree_path_table);
> +	dict_dump();
>  	return 0;
>  }
> -- 
> 1.8.4.38.g317e65b
> 
> 

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 05/38] pack v4: add commit object parsing
  2013-09-05 10:30   ` SZEDER Gábor
@ 2013-09-05 17:30     ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05 17:30 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2152 bytes --]

On Thu, 5 Sep 2013, SZEDER Gábor wrote:

> Hi,
> 
> 
> On Thu, Sep 05, 2013 at 02:19:28AM -0400, Nicolas Pitre wrote:
> > Let's create another dictionary table to hold the author and committer
> > entries.  We use the same table format used for tree entries where the
> > 16 bit data prefix is conveniently used to store the timezone value.
> > 
> > In order to copy straight from a commit object buffer, dict_add_entry()
> > is modified to get the string length as the provided string pointer is
> > > not always null terminated.
> > 
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> > ---
> >  packv4-create.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  1 file changed, 89 insertions(+), 9 deletions(-)
> > 
> > diff --git a/packv4-create.c b/packv4-create.c
> > index eccd9fc..5c08871 100644
> > --- a/packv4-create.c
> > +++ b/packv4-create.c
> > @@ -1,5 +1,5 @@
> >  /*
> > - * packv4-create.c: management of dictionary tables used in pack v4
> > + * packv4-create.c: creation of dictionary tables and objects used in pack v4
> >   *
> >   * (C) Nicolas Pitre <nico@fluxnic.net>
> >   *
> > @@ -80,9 +80,9 @@ static void rehash_entries(struct dict_table *t)
> >  	}
> >  }
> >  
> > -int dict_add_entry(struct dict_table *t, int val, const char *str)
> > +int dict_add_entry(struct dict_table *t, int val, const char *str, int str_len)
> >  {
> > -	int i, val_len = 2, str_len = strlen(str) + 1;
> > +	int i, val_len = 2;
> >  
> >  	if (t->ptr + val_len + str_len > t->size) {
> 
> We need a +1 here on the left side, i.e.
> 
>         if (t->ptr + val_len + str_len + 1 > t->size) {

Absolutely, good catch.

> Sidenote: couldn't we call the 'ptr' field something else, like
> end_offset or end_idx?  It took me some headscratching to figure out
> why it is OK to compare a pointer to an integer above, or use a
> pointer without dereferencing as an index into an array below (because
> ptr is, well, not a pointer after all).

Indeed.  This is a remnant of an earlier implementation which didn't use 
realloc() and therefore this used to be a real pointer.

Both issues now addressed in my tree.

Thanks


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes
  2013-09-05  6:20 ` [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes Nicolas Pitre
@ 2013-09-05 19:02   ` Nicolas Pitre
  2013-09-05 21:48     ` Nicolas Pitre
  2013-09-05 23:57     ` Duy Nguyen
  0 siblings, 2 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05 19:02 UTC (permalink / raw)
  To: Duy Nguyen, git

On Thu, 5 Sep 2013, Nicolas Pitre wrote:

> If the path or name index is zero, this means the entry data is to be
> found inline rather than being located in the dictionary table. This is
> there to allow easy completion of thin packs without having to add new
> table entries which would have required a full rewrite of the pack data.
> 
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>

I'm now dropping this patch.  Please also remove this from your 
documentation patch.

I think that we've found a way to better support thin packs.

You said:

> What if the sender prepares the sha-1 table to contain missing objects
> in advance? The sender should know what base objects are missing. Then
> we only need to append objects at the receiving end and verify that
> all new objects are also present in the sha-1 table.

So the SHA1 table is covered.

Missing objects in a thin pack cannot themselves be deltas.  We add 
their undeltified form at the end of a pack for the pack to be complete.  
Therefore those missing objects serve only as base objects for other 
deltas.

Although it is possible to have deltified commit objects in pack v2, I 
don't think this happens very often. There are no deltified commit 
objects in pack v4.

Blob objects are the same in pack v2 and pack v4.  No dictionary 
references are needed.

That leaves only tree objects.  And because we've also discussed the 
need to have non transcoded object representations for those odd cases 
such as zero padded file modes, we might as well simply use that for the 
appended tree objects already needed to complete a thin pack.  At least 
the strings in tree entries will be compressed that way.

Problem solved, and one less special case in the code.

What do you think?


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes
  2013-09-05 19:02   ` Nicolas Pitre
@ 2013-09-05 21:48     ` Nicolas Pitre
  2013-09-05 23:57     ` Duy Nguyen
  1 sibling, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-05 21:48 UTC (permalink / raw)
  To: Duy Nguyen, git

On Thu, 5 Sep 2013, Nicolas Pitre wrote:

> On Thu, 5 Sep 2013, Nicolas Pitre wrote:
> 
> > If the path or name index is zero, this means the entry data is to be
> > found inline rather than being located in the dictionary table. This is
> > there to allow easy completion of thin packs without having to add new
> > table entries which would have required a full rewrite of the pack data.
> > 
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> 
> I'm now dropping this patch.  Please also remove this from your 
> documentation patch.

Well... I couldn't resist another little change that has been nagging me 
for a while.

Both the author and committer time stamps are very closely related most 
of the time.  So the committer time stamp is now encoded as a difference 
against the author time stamp with the LSB indicating a negative 
difference.

On git.git this saves 0.3% on the pack size.  Not much, but still 
impressive for only a time stamp.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes
  2013-09-05 19:02   ` Nicolas Pitre
  2013-09-05 21:48     ` Nicolas Pitre
@ 2013-09-05 23:57     ` Duy Nguyen
  1 sibling, 0 replies; 124+ messages in thread
From: Duy Nguyen @ 2013-09-05 23:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Fri, Sep 6, 2013 at 2:02 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> I think that we've found a way to better support thin packs.
>
> You said:
>
>> What if the sender prepares the sha-1 table to contain missing objects
>> in advance? The sender should know what base objects are missing. Then
>> we only need to append objects at the receiving end and verify that
>> all new objects are also present in the sha-1 table.
>
> So the SHA1 table is covered.
>
> Missing objects in a thin pack cannot themselves be deltas.  We add
> their undeltified form at the end of a pack for the pack to be complete.
> Therefore those missing objects serve only as base objects for other
> deltas.
>
> Although it is possible to have deltified commit objects in pack v2, I
> don't think this happens very often. There are no deltified commit
> objects in pack v4.
>
> Blob objects are the same in pack v2 and pack v4.  No dictionary
> references are needed.
>
> That leaves only tree objects.  And because we've also discussed the
> need to have non transcoded object representations for those odd cases
> such as zero padded file modes, we might as well simply use that for the
> appended tree objects already needed to complete a thin pack.  At least
> the strings in tree entries will be compressed that way.
>
> Problem solved, and one less special case in the code.
>
> What do you think?

Agreed.

>  Please also remove this from your documentation patch.

Will do.
-- 
Duy

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 10/38] pack v4: commit object encoding
  2013-09-05  6:19 ` [PATCH 10/38] pack v4: commit object encoding Nicolas Pitre
@ 2013-09-06  6:57   ` Junio C Hamano
  2013-09-06 21:28     ` Nicolas Pitre
  0 siblings, 1 reply; 124+ messages in thread
From: Junio C Hamano @ 2013-09-06  6:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> This goes as follows:
>
> - Tree reference: either variable length encoding of the index
>   into the SHA1 table or the literal SHA1 prefixed by 0 (see
>   encode_sha1ref()).
>
> - Parent count: variable length encoding of the number of parents.
>   This is normally going to occupy a single byte but doesn't have to.
>
> - List of parent references: a list of encode_sha1ref() encoded
>   references, or nothing if the parent count was zero.
>
> - Author reference: variable length encoding of an index into the author
>   identifier dictionary table which also covers the time zone.  To make
>   the overall encoding efficient, the author table is sorted by usage
>   frequency so the most used names are first and require the shortest
>   index encoding.
>
> - Author time stamp: variable length encoded.  Year 2038 ready!
>
> - Committer reference: same as author reference.
>
> - Committer time stamp: same as author time stamp.
>
> The remainder of the canonical commit object content is then zlib
> compressed and appended to the above.
>
> Rationale: The most important commit object data is densely encoded while
> requiring no zlib inflate processing on access, and all SHA1 references
> are most likely to be direct indices into the pack index file requiring
> no SHA1 search into the pack index file.

May I suggest a small change to the above, though.

Reorder the entries so that Parent count, list of parents and
committer time stamp come first in this order, and then the rest.

That way, commit.c::parse_commit() could populate its field lazily
with parsing only the very minimum set of fields, and then the
revision walker, revision.c::add_parents_to_list(), can find where
in the priority queue to add the parents to the list of commits to
be processed while still keeping the object partially parsed.  If a
commit is UNINTERESTING, no further parsing needs to be done.

>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  packv4-create.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 119 insertions(+)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index 12527c0..d4a79f4 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -14,6 +14,9 @@
>  #include "pack.h"
>  #include "varint.h"
>  
> +
> +static int pack_compression_level = Z_DEFAULT_COMPRESSION;
> +
>  struct data_entry {
>  	unsigned offset;
>  	unsigned size;
> @@ -274,6 +277,122 @@ static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
>  	return 1 + 20;
>  }
>  
> +/*
> + * This converts a canonical commit object buffer into its
> + * tightly packed representation using the already populated
> + * and sorted commit_name_table dictionary.  The parsing is
> + * strict so to ensure the canonical version may always be
> + * regenerated and produce the same hash.
> + */
> +void *pv4_encode_commit(void *buffer, unsigned long *sizep)
> +{
> +	unsigned long size = *sizep;
> +	char *in, *tail, *end;
> +	unsigned char *out;
> +	unsigned char sha1[20];
> +	int nb_parents, index, tz_val;
> +	unsigned long time;
> +	z_stream stream;
> +	int status;
> +
> +	/*
> +	 * It is guaranteed that the output is always going to be smaller
> +	 * than the input.  We could even do this conversion in place.
> +	 */
> +	in = buffer;
> +	tail = in + size;
> +	buffer = xmalloc(size);
> +	out = buffer;
> +
> +	/* parse the "tree" line */
> +	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
> +		goto bad_data;
> +	if (get_sha1_lowhex(in + 5, sha1) < 0)
> +		goto bad_data;
> +	in += 46;
> +	out += encode_sha1ref(sha1, out);
> +
> +	/* count how many "parent" lines */
> +	nb_parents = 0;
> +	while (in + 48 < tail && !memcmp(in, "parent ", 7) && in[47] == '\n') {
> +		nb_parents++;
> +		in += 48;
> +	}
> +	out += encode_varint(nb_parents, out);
> +
> +	/* rewind and parse the "parent" lines */
> +	in -= 48 * nb_parents;
> +	while (nb_parents--) {
> +		if (get_sha1_lowhex(in + 7, sha1))
> +			goto bad_data;
> +		out += encode_sha1ref(sha1, out);
> +		in += 48;
> +	}
> +
> +	/* parse the "author" line */
> +	/* it must be at least "author x <x> 0 +0000\n" i.e. 21 chars */
> +	if (in + 21 >= tail || memcmp(in, "author ", 7))
> +		goto bad_data;
> +	in += 7;
> +	end = get_nameend_and_tz(in, &tz_val);
> +	if (!end)
> +		goto bad_data;
> +	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
> +	if (index < 0)
> +		goto bad_dict;
> +	out += encode_varint(index, out);
> +	time = strtoul(end, &end, 10);
> +	if (!end || end[0] != ' ' || end[6] != '\n')
> +		goto bad_data;
> +	out += encode_varint(time, out);
> +	in = end + 7;
> +
> +	/* parse the "committer" line */
> +	/* it must be at least "committer x <x> 0 +0000\n" i.e. 24 chars */
> +	if (in + 24 >= tail || memcmp(in, "committer ", 7))
> +		goto bad_data;
> +	in += 10;
> +	end = get_nameend_and_tz(in, &tz_val);
> +	if (!end)
> +		goto bad_data;
> +	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
> +	if (index < 0)
> +		goto bad_dict;
> +	out += encode_varint(index, out);
> +	time = strtoul(end, &end, 10);
> +	if (!end || end[0] != ' ' || end[6] != '\n')
> +		goto bad_data;
> +	out += encode_varint(time, out);
> +	in = end + 7;
> +
> +	/* finally, deflate the remaining data */
> +	memset(&stream, 0, sizeof(stream));
> +	deflateInit(&stream, pack_compression_level);
> +	stream.next_in = (unsigned char *)in;
> +	stream.avail_in = tail - in;
> +	stream.next_out = (unsigned char *)out;
> +	stream.avail_out = size - (out - (unsigned char *)buffer);
> +	status = deflate(&stream, Z_FINISH);
> +	end = (char *)stream.next_out;
> +	deflateEnd(&stream);
> +	if (status != Z_STREAM_END) {
> +		error("deflate error status %d", status);
> +		goto bad;
> +	}
> +
> +	*sizep = end - (char *)buffer;
> +	return buffer;
> +
> +bad_data:
> +	error("bad commit data");
> +	goto bad;
> +bad_dict:
> +	error("bad dict entry");
> +bad:
> +	free(buffer);
> +	return NULL;
> +}
> +
>  static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
>  {
>  	unsigned i, nr_objects = p->num_objects;

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 10/38] pack v4: commit object encoding
  2013-09-06  6:57   ` Junio C Hamano
@ 2013-09-06 21:28     ` Nicolas Pitre
  2013-09-06 22:08       ` Junio C Hamano
  0 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-06 21:28 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Thu, 5 Sep 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > This goes as follows:
> >
> > - Tree reference: either variable length encoding of the index
> >   into the SHA1 table or the literal SHA1 prefixed by 0 (see
> >   encode_sha1ref()).
> >
> > - Parent count: variable length encoding of the number of parents.
> >   This is normally going to occupy a single byte but doesn't have to.
> >
> > - List of parent references: a list of encode_sha1ref() encoded
> >   references, or nothing if the parent count was zero.
> >
> > - Author reference: variable length encoding of an index into the author
> >   identifier dictionary table which also covers the time zone.  To make
> >   the overall encoding efficient, the author table is sorted by usage
> >   frequency so the most used names are first and require the shortest
> >   index encoding.
> >
> > - Author time stamp: variable length encoded.  Year 2038 ready!
> >
> > - Committer reference: same as author reference.
> >
> > - Committer time stamp: same as author time stamp.
> >
> > The remainder of the canonical commit object content is then zlib
> > compressed and appended to the above.
> >
> > Rationale: The most important commit object data is densely encoded while
> > requiring no zlib inflate processing on access, and all SHA1 references
> > are most likely to be direct indices into the pack index file requiring
> > no SHA1 search into the pack index file.
> 
> May I suggest a small change to the above, though.
> 
> Reorder the entries so that Parent count, list of parents and
> committer time stamp come first in this order, and then the rest.
> 
> That way, commit.c::parse_commit() could populate its field lazily
> with parsing only the very minimum set of fields, and then the
> revision walker, revision.c::add_parents_to_list(), can find where
> in the priority queue to add the parents to the list of commits to
> be processed while still keeping the object partially parsed.  If a
> commit is UNINTERESTING, no further parsing needs to be done.

OK.  If I understand correctly, the committer time stamp is more 
important than the author's, right?  Because my latest change in the 
format was to make the former as a difference against the latter and that 
would obviously have to be reversed.

Also, to keep some kind of aesthetic symmetry (if such a thing may exist in a 
raw byte format) may I suggest keeping the tree reference first.  That 
is easy to skip over if you don't need it, something like:

	if (!*ptr)
		ptr += 1 + 20;
	else
		while (*ptr++ & 128);

Whereas, for a checkout where only the tree info is needed, if it is 
located after the list of parents, then the above needs to be done for 
all those parents and the committer time.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 10/38] pack v4: commit object encoding
  2013-09-06 21:28     ` Nicolas Pitre
@ 2013-09-06 22:08       ` Junio C Hamano
  2013-09-07  4:41         ` Nicolas Pitre
  0 siblings, 1 reply; 124+ messages in thread
From: Junio C Hamano @ 2013-09-06 22:08 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> OK.  If I understand correctly, the committer time stamp is more 
> important than the author's, right?

Yeah, it matters a lot more when doing timestamp based traversal
without the reachability bitmaps.

> ... may I suggest keeping the tree reference first.  That 
> is easy to skip over if you don't need it,...
> ... Whereas, for a checkout where only the tree info is needed, if it is 
> located after the list of parents, then the above needs to be done for 
> all those parents and the committer time.

Hmm.  I wonder if that is a really good trade-off.

"checkout" is to parse a single commit object and grab the "tree"
field, while "log" is to parse millions of commit objects to grab
their "parents" and "committer timestamp" fields ("log path/spec"
needs to grab "tree", too, so that does not make "tree" extremely
uncommon compared to the other two fields, though).

I dunno.

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 10/38] pack v4: commit object encoding
  2013-09-06 22:08       ` Junio C Hamano
@ 2013-09-07  4:41         ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-07  4:41 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Fri, 6 Sep 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > OK.  If I understand correctly, the committer time stamp is more 
> > important than the author's, right?
> 
> Yeah, it matters a lot more when doing timestamp based traversal
> without the reachability bitmaps.
> 
> > ... may I suggest keeping the tree reference first.  That 
> > is easy to skip over if you don't need it,...
> > ... Whereas, for a checkout where only the tree info is needed, if it is 
> > located after the list of parents, then the above needs to be done for 
> > all those parents and the committer time.
> 
> Hmm.  I wonder if that is a really good trade-off.
> 
> "checkout" is to parse a single commit object and grab the "tree"
> field, while "log" is to parse millions of commit objects to grab
> their "parents" and "committer timestamp" fields ("log path/spec"
> needs to grab "tree", too, so that does not make "tree" extremely
> uncommon compared to the other two fields, though).
> 
> I dunno.

I've therefore settled in the middle.  The patch description now looks 
like:

|    This goes as follows:
|
|    - Tree reference: either variable length encoding of the index
|      into the SHA1 table or the literal SHA1 prefixed by 0 (see
|      encode_sha1ref()).
|
|    - Parent count: variable length encoding of the number of parents.
|      This is normally going to occupy a single byte but doesn't have to.
|
|    - List of parent references: a list of encode_sha1ref() encoded
|      references, or nothing if the parent count was zero.
|
|    - Committer time stamp: variable length encoded.  Year 2038 ready!
|      Unlike the canonical representation, this is stored close to the
|      list of parents so the important data for history traversal can be
|      retrieved without parsing the rest of the object.
|
|    - Committer reference: variable length encoding of an index into the
|      ident dictionary table which also covers the time zone.  To make
|      the overall encoding efficient, the ident table is sorted by usage
|      frequency so the most used entries are first and require the shortest
|      index encoding.
|
|    - Author time stamp: encoded as a difference against the committer
|      time stamp, with the LSB used to indicate commit time is behind
|      author time.
|
|    - Author reference: same as committer reference.
|
|    The remainder of the canonical commit object content is then zlib
|    compressed and appended to the above.

I also updated the documentation patch accordingly in my tree.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH 00/12] pack v4 support in index-pack
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (37 preceding siblings ...)
  2013-09-05  6:20 ` [PATCH 38/38] packv4-create: add a command line argument to limit tree copy sequences Nicolas Pitre
@ 2013-09-07 10:43 ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 01/12] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
                     ` (12 more replies)
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  39 siblings, 13 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

This makes index-pack recognize pack v4. It still lacks:

 - the ability to walk through multi-base trees
 - thin pack support

The first is not easy to solve imo, but it does not impact us in the short
term because pack-objects probably will not learn to produce such
trees any time soon.

The second should be done after pack-objects can produce thin packs,
else it's hard to verify that the code works as expected.

This is based on Nico's tree, which does not really match the series this
post is a reply to, due to some format changes. I don't know, maybe we
could share more code with packv4-parse.c. Right now I just need
something that works and is somewhat maintainable.

Nguyễn Thái Ngọc Duy (12):
  pack v4: split pv4_create_dict() out of load_dict()
  index-pack: split out varint decoding code
  index-pack: do not allocate buffer for unpacking deltas in the first pass
  index-pack: split inflate/digest code out of unpack_entry_data
  index-pack: parse v4 header and dictionaries
  index-pack: make sure all objects are registered in v4's SHA-1 table
  index-pack: parse v4 commit format
  index-pack: parse v4 tree format
  index-pack: move delta base queuing code to unpack_raw_entry
  index-pack: record all delta bases in v4 (tree and ref-delta)
  index-pack: skip looking for ofs-deltas in v4 as they are not allowed
  index-pack: resolve v4 one-base trees

 builtin/index-pack.c | 679 ++++++++++++++++++++++++++++++++++++++++++++-------
 packv4-parse.c       |  63 ++---
 packv4-parse.h       |   8 +
 3 files changed, 627 insertions(+), 123 deletions(-)

-- 
1.8.2.83.gc99314b

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH 01/12] pack v4: split pv4_create_dict() out of load_dict()
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 02/12] index-pack: split out varint decoding code Nguyễn Thái Ngọc Duy
                     ` (11 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 packv4-parse.c | 63 ++++++++++++++++++++++++++++++++--------------------------
 packv4-parse.h |  8 ++++++++
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/packv4-parse.c b/packv4-parse.c
index 63bba03..82661ba 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -30,11 +30,38 @@ const unsigned char *get_sha1ref(struct packed_git *p,
 	return sha1;
 }
 
-struct packv4_dict {
-	const unsigned char *data;
-	unsigned int nb_entries;
-	unsigned int offsets[FLEX_ARRAY];
-};
+struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size)
+{
+	struct packv4_dict *dict;
+	int i;
+
+	/* count number of entries */
+	int nb_entries = 0;
+	const unsigned char *cp = data;
+	while (cp < data + dict_size - 3) {
+		cp += 2;  /* prefix bytes */
+		cp += strlen((const char *)cp);  /* entry string */
+		cp += 1;  /* terminating NUL */
+		nb_entries++;
+	}
+	if (cp - data != dict_size) {
+		error("dict size mismatch");
+		return NULL;
+	}
+
+	dict = xmalloc(sizeof(*dict) + nb_entries * sizeof(dict->offsets[0]));
+	dict->data = data;
+	dict->nb_entries = nb_entries;
+
+	cp = data;
+	for (i = 0; i < nb_entries; i++) {
+		dict->offsets[i] = cp - data;
+		cp += 2;
+		cp += strlen((const char *)cp) + 1;
+	}
+
+	return dict;
+}
 
 static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 {
@@ -45,7 +72,7 @@ static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 	const unsigned char *cp;
 	git_zstream stream;
 	struct packv4_dict *dict;
-	int nb_entries, i, st;
+	int st;
 
 	/* get uncompressed dictionary data size */
 	src = use_pack(p, &w_curs, curpos, &avail);
@@ -77,32 +104,12 @@ static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 		return NULL;
 	}
 
-	/* count number of entries */
-	nb_entries = 0;
-	cp = data;
-	while (cp < data + dict_size - 3) {
-		cp += 2;  /* prefix bytes */
-		cp += strlen((const char *)cp);  /* entry string */
-		cp += 1;  /* terminating NUL */
-		nb_entries++;
-	}
-	if (cp - data != dict_size) {
-		error("dict size mismatch");
+	dict = pv4_create_dict(data, dict_size);
+	if (!dict) {
 		free(data);
 		return NULL;
 	}
 
-	dict = xmalloc(sizeof(*dict) + nb_entries * sizeof(dict->offsets[0]));
-	dict->data = data;
-	dict->nb_entries = nb_entries;
-
-	cp = data;
-	for (i = 0; i < nb_entries; i++) {
-		dict->offsets[i] = cp - data;
-		cp += 2;
-		cp += strlen((const char *)cp) + 1;
-	}
-
 	*offset = curpos;
 	return dict;
 }
diff --git a/packv4-parse.h b/packv4-parse.h
index 5f9d809..0b2405a 100644
--- a/packv4-parse.h
+++ b/packv4-parse.h
@@ -1,6 +1,14 @@
 #ifndef PACKV4_PARSE_H
 #define PACKV4_PARSE_H
 
+struct packv4_dict {
+	const unsigned char *data;
+	unsigned int nb_entries;
+	unsigned int offsets[FLEX_ARRAY];
+};
+
+struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size);
+
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 		     off_t offset, unsigned long size);
 void *pv4_get_tree(struct packed_git *p, struct pack_window **w_curs,
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 02/12] index-pack: split out varint decoding code
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 01/12] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 03/12] index-pack: do not allocate buffer for unpacking deltas in the first pass Nguyễn Thái Ngọc Duy
                     ` (10 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 82 ++++++++++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9c1cfac..5b1395d 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -275,6 +275,31 @@ static void use(int bytes)
 	consumed_bytes += bytes;
 }
 
+static inline void *fill_and_use(int bytes)
+{
+	void *p = fill(bytes);
+	use(bytes);
+	return p;
+}
+
+static NORETURN void bad_object(unsigned long offset, const char *format,
+		       ...) __attribute__((format (printf, 2, 3)));
+
+static uintmax_t read_varint(void)
+{
+	unsigned char c = *(char*)fill_and_use(1);
+	uintmax_t val = c & 127;
+	while (c & 128) {
+		val += 1;
+		if (!val || MSB(val, 7))
+			bad_object(consumed_bytes,
+				   _("offset overflow in read_varint"));
+		c = *(char*)fill_and_use(1);
+		val = (val << 7) + (c & 127);
+	}
+	return val;
+}
+
 static const char *open_pack_file(const char *pack_name)
 {
 	if (from_stdin) {
@@ -315,9 +340,6 @@ static void parse_pack_header(void)
 	use(sizeof(struct pack_header));
 }
 
-static NORETURN void bad_object(unsigned long offset, const char *format,
-		       ...) __attribute__((format (printf, 2, 3)));
-
 static NORETURN void bad_object(unsigned long offset, const char *format, ...)
 {
 	va_list params;
@@ -455,55 +477,41 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 	return buf == fixed_buf ? NULL : buf;
 }
 
+static void read_typesize_v2(struct object_entry *obj)
+{
+	unsigned char c = *(char*)fill_and_use(1);
+	unsigned shift;
+
+	obj->type = (c >> 4) & 7;
+	obj->size = (c & 15);
+	shift = 4;
+	while (c & 128) {
+		c = *(char*)fill_and_use(1);
+		obj->size += (c & 0x7f) << shift;
+		shift += 7;
+	}
+}
+
 static void *unpack_raw_entry(struct object_entry *obj,
 			      union delta_base *delta_base,
 			      unsigned char *sha1)
 {
-	unsigned char *p;
-	unsigned long size, c;
-	off_t base_offset;
-	unsigned shift;
 	void *data;
+	uintmax_t val;
 
 	obj->idx.offset = consumed_bytes;
 	input_crc32 = crc32(0, NULL, 0);
 
-	p = fill(1);
-	c = *p;
-	use(1);
-	obj->type = (c >> 4) & 7;
-	size = (c & 15);
-	shift = 4;
-	while (c & 0x80) {
-		p = fill(1);
-		c = *p;
-		use(1);
-		size += (c & 0x7f) << shift;
-		shift += 7;
-	}
-	obj->size = size;
+	read_typesize_v2(obj);
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
-		hashcpy(delta_base->sha1, fill(20));
-		use(20);
+		hashcpy(delta_base->sha1, fill_and_use(20));
 		break;
 	case OBJ_OFS_DELTA:
 		memset(delta_base, 0, sizeof(*delta_base));
-		p = fill(1);
-		c = *p;
-		use(1);
-		base_offset = c & 127;
-		while (c & 128) {
-			base_offset += 1;
-			if (!base_offset || MSB(base_offset, 7))
-				bad_object(obj->idx.offset, _("offset value overflow for delta base object"));
-			p = fill(1);
-			c = *p;
-			use(1);
-			base_offset = (base_offset << 7) + (c & 127);
-		}
-		delta_base->offset = obj->idx.offset - base_offset;
+		val = read_varint();
+		delta_base->offset = obj->idx.offset - val;
 		if (delta_base->offset <= 0 || delta_base->offset >= obj->idx.offset)
 			bad_object(obj->idx.offset, _("delta base offset is out of bound"));
 		break;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 03/12] index-pack: do not allocate buffer for unpacking deltas in the first pass
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 01/12] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 02/12] index-pack: split out varint decoding code Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 04/12] index-pack: split inflate/digest code out of unpack_entry_data Nguyễn Thái Ngọc Duy
                     ` (9 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

We do not need deltas until the second pass. Allocating a buffer for them
and then freeing it later is wasteful and unnecessary. Make it use fixed_buf
(aka large blob code path).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 5b1395d..a47cc34 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -446,7 +446,8 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
-	if (type == OBJ_BLOB && size > big_file_threshold)
+	if (is_delta_type(type) ||
+	     (type == OBJ_BLOB && size > big_file_threshold))
 		buf = fixed_buf;
 	else
 		buf = xmalloc(size);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 04/12] index-pack: split inflate/digest code out of unpack_entry_data
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 03/12] index-pack: do not allocate buffer for unpacking deltas in the first pass Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 05/12] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
                     ` (8 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 62 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index a47cc34..0dd7193 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -429,33 +429,19 @@ static int is_delta_type(enum object_type type)
 	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size,
-			       enum object_type type, unsigned char *sha1)
+static void read_and_inflate(unsigned long offset,
+			     void *buf, unsigned long size,
+			     unsigned long wraparound,
+			     git_SHA_CTX *ctx,
+			     unsigned char *sha1)
 {
-	static char fixed_buf[8192];
-	int status;
 	git_zstream stream;
-	void *buf;
-	git_SHA_CTX c;
-	char hdr[32];
-	int hdrlen;
-
-	if (!is_delta_type(type)) {
-		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
-		git_SHA1_Init(&c);
-		git_SHA1_Update(&c, hdr, hdrlen);
-	} else
-		sha1 = NULL;
-	if (is_delta_type(type) ||
-	     (type == OBJ_BLOB && size > big_file_threshold))
-		buf = fixed_buf;
-	else
-		buf = xmalloc(size);
+	int status;
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
+	stream.avail_out = wraparound ? wraparound : size;
 
 	do {
 		unsigned char *last_out = stream.next_out;
@@ -464,17 +450,43 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		status = git_inflate(&stream, 0);
 		use(input_len - stream.avail_in);
 		if (sha1)
-			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
-		if (buf == fixed_buf) {
+			git_SHA1_Update(ctx, last_out, stream.next_out - last_out);
+		if (wraparound) {
 			stream.next_out = buf;
-			stream.avail_out = sizeof(fixed_buf);
+			stream.avail_out = wraparound;
 		}
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, _("inflate returned %d"), status);
 	git_inflate_end(&stream);
 	if (sha1)
-		git_SHA1_Final(sha1, &c);
+		git_SHA1_Final(sha1, ctx);
+}
+
+static void *unpack_entry_data(unsigned long offset, unsigned long size,
+			       enum object_type type, unsigned char *sha1)
+{
+	static char fixed_buf[8192];
+	void *buf;
+	git_SHA_CTX c;
+	char hdr[32];
+	int hdrlen;
+
+	if (!is_delta_type(type)) {
+		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
+		git_SHA1_Init(&c);
+		git_SHA1_Update(&c, hdr, hdrlen);
+	} else
+		sha1 = NULL;
+	if (is_delta_type(type) ||
+	     (type == OBJ_BLOB && size > big_file_threshold))
+		buf = fixed_buf;
+	else
+		buf = xmalloc(size);
+
+	read_and_inflate(offset, buf, size,
+			 buf == fixed_buf ? sizeof(fixed_buf) : 0,
+			 &c, sha1);
 	return buf == fixed_buf ? NULL : buf;
 }
 
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 05/12] index-pack: parse v4 header and dictionaries
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (3 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 04/12] index-pack: split inflate/digest code out of unpack_entry_data Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-08  2:14     ` Nicolas Pitre
  2013-09-07 10:43   ` [PATCH 06/12] index-pack: make sure all objects are registered in v4's SHA-1 table Nguyễn Thái Ngọc Duy
                     ` (7 subsequent siblings)
  12 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 0dd7193..59b6c56 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -11,6 +11,7 @@
 #include "exec_cmd.h"
 #include "streaming.h"
 #include "thread-utils.h"
+#include "packv4-parse.h"
 
 static const char index_pack_usage[] =
 "git index-pack [-v] [-o <index-file>] [--keep | --keep=<msg>] [--verify] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
@@ -70,6 +71,8 @@ struct delta_entry {
 static struct object_entry *objects;
 static struct delta_entry *deltas;
 static struct thread_local nothread_data;
+static unsigned char *sha1_table;
+static struct packv4_dict *name_dict, *path_dict;
 static int nr_objects;
 static int nr_deltas;
 static int nr_resolved_deltas;
@@ -81,6 +84,7 @@ static int do_fsck_object;
 static int verbose;
 static int show_stat;
 static int check_self_contained_and_connected;
+static int packv4;
 
 static struct progress *progress;
 
@@ -300,6 +304,21 @@ static uintmax_t read_varint(void)
 	return val;
 }
 
+static void *read_data(int size)
+{
+	const int max = sizeof(input_buffer);
+	void *buf;
+	char *p;
+	p = buf = xmalloc(size);
+	while (size) {
+		int to_fill = size > max ? max : size;
+		memcpy(p, fill_and_use(to_fill), to_fill);
+		p += to_fill;
+		size -= to_fill;
+	}
+	return buf;
+}
+
 static const char *open_pack_file(const char *pack_name)
 {
 	if (from_stdin) {
@@ -332,7 +351,9 @@ static void parse_pack_header(void)
 	/* Header consistency check */
 	if (hdr->hdr_signature != htonl(PACK_SIGNATURE))
 		die(_("pack signature mismatch"));
-	if (!pack_version_ok(hdr->hdr_version))
+	if (hdr->hdr_version == htonl(4))
+		packv4 = 1;
+	else if (!pack_version_ok(hdr->hdr_version))
 		die(_("pack version %"PRIu32" unsupported"),
 			ntohl(hdr->hdr_version));
 
@@ -1013,6 +1034,31 @@ static void *threaded_second_pass(void *data)
 }
 #endif
 
+static struct packv4_dict *read_dict(void)
+{
+	unsigned long size;
+	unsigned char *data;
+	struct packv4_dict *dict;
+
+	size = read_varint();
+	data = xmallocz(size);
+	read_and_inflate(consumed_bytes, data, size, 0, NULL, NULL);
+	dict = pv4_create_dict(data, size);
+	if (!dict)
+		die("unable to parse dictionary");
+	return dict;
+}
+
+static void parse_dictionaries(void)
+{
+	if (!packv4)
+		return;
+
+	sha1_table = read_data(20 * nr_objects);
+	name_dict = read_dict();
+	path_dict = read_dict();
+}
+
 /*
  * First pass:
  * - find locations of all objects;
@@ -1651,6 +1697,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	parse_pack_header();
 	objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
 	deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
+	parse_dictionaries();
 	parse_pack_objects(pack_sha1);
 	resolve_deltas();
 	conclude_pack(fix_thin_pack, curr_pack, pack_sha1);
@@ -1661,6 +1708,9 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	if (show_stat)
 		show_pack_info(stat_only);
 
+	if (packv4)
+		die("we're not there yet");
+
 	idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
 	for (i = 0; i < nr_objects; i++)
 		idx_objects[i] = &objects[i].idx;
@@ -1677,6 +1727,15 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	free(objects);
 	free(index_name_buf);
 	free(keep_name_buf);
+	free(sha1_table);
+	if (name_dict) {
+		free((void*)name_dict->data);
+		free(name_dict);
+	}
+	if (path_dict) {
+		free((void*)path_dict->data);
+		free(path_dict);
+	}
 	if (pack_name == NULL)
 		free((void *) curr_pack);
 	if (index_name == NULL)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 06/12] index-pack: make sure all objects are registered in v4's SHA-1 table
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (4 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 05/12] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 07/12] index-pack: parse v4 commit format Nguyễn Thái Ngọc Duy
                     ` (6 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 59b6c56..db2370d 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -742,6 +742,19 @@ static int check_collison(struct object_entry *entry)
 	return 0;
 }
 
+static void check_against_sha1table(struct object_entry *obj)
+{
+	const unsigned char *found;
+	if (!packv4)
+		return;
+
+	found = bsearch(obj->idx.sha1, sha1_table, nr_objects, 20,
+			(int (*)(const void *, const void *))hashcmp);
+	if (!found)
+		die(_("object %s not found in SHA-1 table"),
+		    sha1_to_hex(obj->idx.sha1));
+}
+
 static void sha1_object(const void *data, struct object_entry *obj_entry,
 			unsigned long size, enum object_type type,
 			const unsigned char *sha1)
@@ -910,6 +923,7 @@ static void resolve_delta(struct object_entry *delta_obj,
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
+	check_against_sha1table(delta_obj);
 	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
@@ -1087,8 +1101,12 @@ static void parse_pack_objects(unsigned char *sha1)
 			/* large blobs, check later */
 			obj->real_type = OBJ_BAD;
 			nr_delays++;
-		} else
-			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
+			check_against_sha1table(obj);
+		} else {
+			check_against_sha1table(obj);
+			sha1_object(data, NULL, obj->size, obj->type,
+				    obj->idx.sha1);
+		}
 		free(data);
 		display_progress(progress, i+1);
 	}
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 07/12] index-pack: parse v4 commit format
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (5 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 06/12] index-pack: make sure all objects are registered in v4's SHA-1 table Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 08/12] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
                     ` (5 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 92 insertions(+), 3 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index db2370d..210b78d 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -304,6 +304,30 @@ static uintmax_t read_varint(void)
 	return val;
 }
 
+static const unsigned char *read_sha1ref(void)
+{
+	unsigned int index = read_varint();
+	if (!index) {
+		static unsigned char sha1[20];
+		hashcpy(sha1, fill_and_use(20));
+		return sha1;
+	}
+	index--;
+	if (index >= nr_objects)
+		bad_object(consumed_bytes,
+			   _("bad index in read_sha1ref"));
+	return sha1_table + index * 20;
+}
+
+static const unsigned char *read_dictref(struct packv4_dict *dict)
+{
+	unsigned int index = read_varint();
+	if (index >= dict->nb_entries)
+		bad_object(consumed_bytes,
+			   _("bad index in read_dictref"));
+	return  dict->data + dict->offsets[index];
+}
+
 static void *read_data(int size)
 {
 	const int max = sizeof(input_buffer);
@@ -484,6 +508,59 @@ static void read_and_inflate(unsigned long offset,
 		git_SHA1_Final(sha1, ctx);
 }
 
+static void *unpack_commit_v4(unsigned int offset,
+			      unsigned long size,
+			      unsigned char *sha1)
+{
+	unsigned int nb_parents;
+	const unsigned char *committer, *author, *ident;
+	unsigned long author_time, committer_time;
+	git_SHA_CTX ctx;
+	char hdr[32];
+	int hdrlen;
+	int16_t committer_tz, author_tz;
+	struct strbuf dst;
+
+	strbuf_init(&dst, size);
+
+	strbuf_addf(&dst, "tree %s\n", sha1_to_hex(read_sha1ref()));
+	nb_parents = read_varint();
+	while (nb_parents--)
+		strbuf_addf(&dst, "parent %s\n", sha1_to_hex(read_sha1ref()));
+
+	committer_time = read_varint();
+	ident = read_dictref(name_dict);
+	committer_tz = (ident[0] << 8) | ident[1];
+	committer = ident + 2;
+
+	author_time = read_varint();
+	ident = read_dictref(name_dict);
+	author_tz = (ident[0] << 8) | ident[1];
+	author = ident + 2;
+
+	if (author_time & 1)
+		author_time = committer_time + (author_time >> 1);
+	else
+		author_time = committer_time - (author_time >> 1);
+
+	strbuf_addf(&dst,
+		    "author %s %lu %+05d\n"
+		    "committer %s %lu %+05d\n",
+		    author, author_time, author_tz,
+		    committer, committer_time, committer_tz);
+
+	if (dst.len > size)
+		bad_object(offset, _("bad commit"));
+
+	hdrlen = sprintf(hdr, "commit %lu", size) + 1;
+	git_SHA1_Init(&ctx);
+	git_SHA1_Update(&ctx, hdr, hdrlen);
+	git_SHA1_Update(&ctx, dst.buf, dst.len);
+	read_and_inflate(offset, dst.buf + dst.len, size - dst.len,
+			 0, &ctx, sha1);
+	return dst.buf;
+}
+
 static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
@@ -493,6 +570,9 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 	char hdr[32];
 	int hdrlen;
 
+	if (type == OBJ_PV4_COMMIT)
+		return unpack_commit_v4(offset, size, sha1);
+
 	if (!is_delta_type(type)) {
 		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
 		git_SHA1_Init(&c);
@@ -536,7 +616,13 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	obj->idx.offset = consumed_bytes;
 	input_crc32 = crc32(0, NULL, 0);
 
-	read_typesize_v2(obj);
+	if (packv4) {
+		val = read_varint();
+		obj->type = val & 15;
+		obj->size = val >> 4;
+	} else
+		read_typesize_v2(obj);
+	obj->real_type = obj->type;
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
@@ -554,6 +640,10 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	case OBJ_BLOB:
 	case OBJ_TAG:
 		break;
+
+	case OBJ_PV4_COMMIT:
+		obj->real_type = OBJ_COMMIT;
+		break;
 	default:
 		bad_object(obj->idx.offset, _("unknown object type %d"), obj->type);
 	}
@@ -1092,7 +1182,6 @@ static void parse_pack_objects(unsigned char *sha1)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
 		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
-		obj->real_type = obj->type;
 		if (is_delta_type(obj->type)) {
 			nr_deltas++;
 			delta->obj_no = i;
@@ -1104,7 +1193,7 @@ static void parse_pack_objects(unsigned char *sha1)
 			check_against_sha1table(obj);
 		} else {
 			check_against_sha1table(obj);
-			sha1_object(data, NULL, obj->size, obj->type,
+			sha1_object(data, NULL, obj->size, obj->real_type,
 				    obj->idx.sha1);
 		}
 		free(data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 08/12] index-pack: parse v4 tree format
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (6 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 07/12] index-pack: parse v4 commit format Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-08  2:52     ` Nicolas Pitre
  2013-09-07 10:43   ` [PATCH 09/12] index-pack: move delta base queuing code to unpack_raw_entry Nguyễn Thái Ngọc Duy
                     ` (4 subsequent siblings)
  12 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 4 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 210b78d..51ca64b 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -319,6 +319,21 @@ static const unsigned char *read_sha1ref(void)
 	return sha1_table + index * 20;
 }
 
+static const unsigned char *read_sha1table_ref(void)
+{
+	const unsigned char *sha1 = read_sha1ref();
+	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects * 20) {
+		unsigned char *found;
+		found = bsearch(sha1, sha1_table, nr_objects, 20,
+				(int (*)(const void *, const void *))hashcmp);
+		if (!found)
+			bad_object(consumed_bytes,
+				   _("SHA-1 %s not found in SHA-1 table"),
+				   sha1_to_hex(sha1));
+	}
+	return sha1;
+}
+
 static const unsigned char *read_dictref(struct packv4_dict *dict)
 {
 	unsigned int index = read_varint();
@@ -561,17 +576,93 @@ static void *unpack_commit_v4(unsigned int offset,
 	return dst.buf;
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size,
-			       enum object_type type, unsigned char *sha1)
+/*
+ * v4 trees are actually kind of deltas and we don't do delta in the
+ * first pass. This function only walks through a tree object to find
+ * the end offset, register object dependencies and performs limited
+ * validation.
+ */
+static void *unpack_tree_v4(struct object_entry *obj,
+			    unsigned int offset, unsigned long size,
+			    unsigned char *sha1)
+{
+	unsigned int nr = read_varint();
+	const unsigned char *last_base = NULL;
+	struct strbuf sb = STRBUF_INIT;
+	while (nr) {
+		unsigned int copy_start_or_path = read_varint();
+		if (copy_start_or_path & 1) { /* copy_start */
+			unsigned int copy_count = read_varint();
+			if (copy_count & 1) { /* first delta */
+				last_base = read_sha1table_ref();
+			} else if (!last_base)
+				bad_object(offset,
+					   _("bad copy count index in unpack_tree_v4"));
+			copy_count >>= 1;
+			if (!copy_count)
+				bad_object(offset,
+					   _("bad copy count index in unpack_tree_v4"));
+			nr -= copy_count;
+		} else {	/* path */
+			unsigned int path_idx = copy_start_or_path >> 1;
+			const unsigned char *entry_sha1;
+
+			if (path_idx >= path_dict->nb_entries)
+				bad_object(offset,
+					   _("bad path index in unpack_tree_v4"));
+			entry_sha1 = read_sha1ref();
+			nr--;
+
+			if (!last_base) {
+				const unsigned char *path;
+				unsigned mode;
+
+				path = path_dict->data + path_dict->offsets[path_idx];
+				mode = (path[0] << 8) | path[1];
+				strbuf_addf(&sb, "%o %s%c", mode, path+2, '\0');
+				strbuf_add(&sb, entry_sha1, 20);
+				if (sb.len > size)
+					bad_object(offset,
+						   _("tree larger than expected"));
+			}
+		}
+	}
+
+	if (last_base) {
+		strbuf_release(&sb);
+		return NULL;
+	} else {
+		git_SHA_CTX ctx;
+		char hdr[32];
+		int hdrlen;
+
+		if (sb.len != size)
+			bad_object(offset, _("tree size mismatch"));
+
+		hdrlen = sprintf(hdr, "tree %lu", size) + 1;
+		git_SHA1_Init(&ctx);
+		git_SHA1_Update(&ctx, hdr, hdrlen);
+		git_SHA1_Update(&ctx, sb.buf, size);
+		git_SHA1_Final(sha1, &ctx);
+		return strbuf_detach(&sb, NULL);
+	}
+}
+
+static void *unpack_entry_data(struct object_entry *obj, unsigned char *sha1)
 {
 	static char fixed_buf[8192];
 	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
 	int hdrlen;
+	unsigned long offset = obj->idx.offset;
+	unsigned long size = obj->size;
+	enum object_type type = obj->type;
 
 	if (type == OBJ_PV4_COMMIT)
 		return unpack_commit_v4(offset, size, sha1);
+	if (type == OBJ_PV4_TREE)
+		return unpack_tree_v4(obj, offset, size, sha1);
 
 	if (!is_delta_type(type)) {
 		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
@@ -640,16 +731,19 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	case OBJ_BLOB:
 	case OBJ_TAG:
 		break;
-
 	case OBJ_PV4_COMMIT:
 		obj->real_type = OBJ_COMMIT;
 		break;
+	case OBJ_PV4_TREE:
+		obj->real_type = OBJ_TREE;
+		break;
+
 	default:
 		bad_object(obj->idx.offset, _("unknown object type %d"), obj->type);
 	}
 	obj->hdr_size = consumed_bytes - obj->idx.offset;
 
-	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
+	data = unpack_entry_data(obj, sha1);
 	obj->idx.crc32 = input_crc32;
 	return data;
 }
@@ -1186,6 +1280,8 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
+		} else if (!data && obj->type == OBJ_PV4_TREE) {
+			/* delay sha1_object() until second pass */
 		} else if (!data) {
 			/* large blobs, check later */
 			obj->real_type = OBJ_BAD;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 09/12] index-pack: move delta base queuing code to unpack_raw_entry
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (7 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 08/12] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 10/12] index-pack: record all delta bases in v4 (tree and ref-delta) Nguyễn Thái Ngọc Duy
                     ` (3 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

For v2, ofs-delta and ref-delta can only queue one delta base at
a time. A v4 tree can have more than one delta base. Move the queuing
code up to unpack_raw_entry() and give unpack_tree_v4() more
flexibility to add its bases.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 51ca64b..c5a8f68 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -576,6 +576,25 @@ static void *unpack_commit_v4(unsigned int offset,
 	return dst.buf;
 }
 
+static void add_sha1_delta(struct object_entry *obj,
+			   const unsigned char *sha1)
+{
+	struct delta_entry *delta = deltas + nr_deltas;
+	delta->obj_no = obj - objects;
+	hashcpy(delta->base.sha1, sha1);
+	nr_deltas++;
+}
+
+static void add_ofs_delta(struct object_entry *obj,
+			  off_t offset)
+{
+	struct delta_entry *delta = deltas + nr_deltas;
+	delta->obj_no = obj - objects;
+	memset(&delta->base, 0, sizeof(delta->base));
+	delta->base.offset = offset;
+	nr_deltas++;
+}
+
 /*
  * v4 trees are actually kind of deltas and we don't do delta in the
  * first pass. This function only walks through a tree object to find
@@ -698,17 +717,16 @@ static void read_typesize_v2(struct object_entry *obj)
 }
 
 static void *unpack_raw_entry(struct object_entry *obj,
-			      union delta_base *delta_base,
 			      unsigned char *sha1)
 {
 	void *data;
-	uintmax_t val;
+	off_t offset;
 
 	obj->idx.offset = consumed_bytes;
 	input_crc32 = crc32(0, NULL, 0);
 
 	if (packv4) {
-		val = read_varint();
+		uintmax_t val = read_varint();
 		obj->type = val & 15;
 		obj->size = val >> 4;
 	} else
@@ -717,14 +735,14 @@ static void *unpack_raw_entry(struct object_entry *obj,
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
-		hashcpy(delta_base->sha1, fill_and_use(20));
+		add_sha1_delta(obj, fill_and_use(20));
 		break;
 	case OBJ_OFS_DELTA:
-		memset(delta_base, 0, sizeof(*delta_base));
-		val = read_varint();
-		delta_base->offset = obj->idx.offset - val;
-		if (delta_base->offset <= 0 || delta_base->offset >= obj->idx.offset)
-			bad_object(obj->idx.offset, _("delta base offset is out of bound"));
+		offset = obj->idx.offset - read_varint();
+		if (offset <= 0 || offset >= obj->idx.offset)
+			bad_object(obj->idx.offset,
+				   _("delta base offset is out of bound"));
+		add_ofs_delta(obj, offset);
 		break;
 	case OBJ_COMMIT:
 	case OBJ_TREE:
@@ -1266,7 +1284,6 @@ static void parse_dictionaries(void)
 static void parse_pack_objects(unsigned char *sha1)
 {
 	int i, nr_delays = 0;
-	struct delta_entry *delta = deltas;
 	struct stat st;
 
 	if (verbose)
@@ -1275,12 +1292,9 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
-		if (is_delta_type(obj->type)) {
-			nr_deltas++;
-			delta->obj_no = i;
-			delta++;
-		} else if (!data && obj->type == OBJ_PV4_TREE) {
+		void *data = unpack_raw_entry(obj, obj->idx.sha1);
+		if (is_delta_type(obj->type) ||
+		    (!data && obj->type == OBJ_PV4_TREE)) {
 			/* delay sha1_object() until second pass */
 		} else if (!data) {
 			/* large blobs, check later */
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 10/12] index-pack: record all delta bases in v4 (tree and ref-delta)
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (8 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 09/12] index-pack: move delta base queuing code to unpack_raw_entry Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 11/12] index-pack: skip looking for ofs-deltas in v4 as they are not allowed Nguyễn Thái Ngọc Duy
                     ` (2 subsequent siblings)
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index c5a8f68..33722e1 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -24,6 +24,7 @@ struct object_entry {
 	enum object_type real_type;
 	unsigned delta_depth;
 	int base_object_no;
+	int nr_bases;		/* only valid for v4 trees */
 };
 
 union delta_base {
@@ -489,6 +490,11 @@ static int is_delta_type(enum object_type type)
 	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
 }
 
+static int is_delta_tree(const struct object_entry *obj)
+{
+	return obj->type == OBJ_PV4_TREE && obj->nr_bases > 0;
+}
+
 static void read_and_inflate(unsigned long offset,
 			     void *buf, unsigned long size,
 			     unsigned long wraparound,
@@ -595,6 +601,20 @@ static void add_ofs_delta(struct object_entry *obj,
 	nr_deltas++;
 }
 
+static void add_tree_delta_base(struct object_entry *obj,
+				const unsigned char *base,
+				int delta_start)
+{
+	int i;
+
+	for (i = delta_start; i < nr_deltas; i++)
+		if (!hashcmp(base, deltas[i].base.sha1))
+			return;
+
+	add_sha1_delta(obj, base);
+	obj->nr_bases++;
+}
+
 /*
  * v4 trees are actually kind of deltas and we don't do delta in the
  * first pass. This function only walks through a tree object to find
@@ -608,12 +628,14 @@ static void *unpack_tree_v4(struct object_entry *obj,
 	unsigned int nr = read_varint();
 	const unsigned char *last_base = NULL;
 	struct strbuf sb = STRBUF_INIT;
+	int delta_start = nr_deltas;
 	while (nr) {
 		unsigned int copy_start_or_path = read_varint();
 		if (copy_start_or_path & 1) { /* copy_start */
 			unsigned int copy_count = read_varint();
 			if (copy_count & 1) { /* first delta */
 				last_base = read_sha1table_ref();
+				add_tree_delta_base(obj, last_base, delta_start);
 			} else if (!last_base)
 				bad_object(offset,
 					   _("bad copy count index in unpack_tree_v4"));
@@ -735,9 +757,15 @@ static void *unpack_raw_entry(struct object_entry *obj,
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
-		add_sha1_delta(obj, fill_and_use(20));
+		if (packv4)
+			add_sha1_delta(obj, read_sha1table_ref());
+		else
+			add_sha1_delta(obj, fill_and_use(20));
 		break;
 	case OBJ_OFS_DELTA:
+		if (packv4)
+			die(_("pack version 4 does not support ofs-delta type (offset %lu)"),
+			    obj->idx.offset);
 		offset = obj->idx.offset - read_varint();
 		if (offset <= 0 || offset >= obj->idx.offset)
 			bad_object(obj->idx.offset,
@@ -1293,8 +1321,7 @@ static void parse_pack_objects(unsigned char *sha1)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
 		void *data = unpack_raw_entry(obj, obj->idx.sha1);
-		if (is_delta_type(obj->type) ||
-		    (!data && obj->type == OBJ_PV4_TREE)) {
+		if (is_delta_type(obj->type) || is_delta_tree(obj)) {
 			/* delay sha1_object() until second pass */
 		} else if (!data) {
 			/* large blobs, check later */
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 11/12] index-pack: skip looking for ofs-deltas in v4 as they are not allowed
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (9 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 10/12] index-pack: record all delta bases in v4 (tree and ref-delta) Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-07 10:43   ` [PATCH 12/12] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  12 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 33722e1..1fa74f4 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1171,10 +1171,13 @@ static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 		find_delta_children(&base_spec,
 				    &base->ref_first, &base->ref_last, OBJ_REF_DELTA);
 
-		memset(&base_spec, 0, sizeof(base_spec));
-		base_spec.offset = base->obj->idx.offset;
-		find_delta_children(&base_spec,
-				    &base->ofs_first, &base->ofs_last, OBJ_OFS_DELTA);
+		if (!packv4) {
+			memset(&base_spec, 0, sizeof(base_spec));
+			base_spec.offset = base->obj->idx.offset;
+			find_delta_children(&base_spec,
+					    &base->ofs_first, &base->ofs_last,
+					    OBJ_OFS_DELTA);
+		}
 
 		if (base->ref_last == -1 && base->ofs_last == -1) {
 			free(base->data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 12/12] index-pack: resolve v4 one-base trees
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (10 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 11/12] index-pack: skip looking for ofs-deltas in v4 as they are not allowed Nguyễn Thái Ngọc Duy
@ 2013-09-07 10:43   ` Nguyễn Thái Ngọc Duy
  2013-09-08  3:28     ` Nicolas Pitre
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  12 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-07 10:43 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

This is the most common case for delta trees. In fact it's the only
kind that's produced by packv4-create. It fits well in the way
index-pack resolves deltas and benefits from threading (the set of
objects depending on this base does not overlap with the set of
objects depending on another base).

Multi-base trees will probably be processed differently.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 178 insertions(+), 16 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 1fa74f4..4a24bc3 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -12,6 +12,8 @@
 #include "streaming.h"
 #include "thread-utils.h"
 #include "packv4-parse.h"
+#include "varint.h"
+#include "tree-walk.h"
 
 static const char index_pack_usage[] =
 "git index-pack [-v] [-o <index-file>] [--keep | --keep=<msg>] [--verify] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
@@ -38,8 +40,8 @@ struct base_data {
 	struct object_entry *obj;
 	void *data;
 	unsigned long size;
-	int ref_first, ref_last;
-	int ofs_first, ofs_last;
+	int ref_first, ref_last, tree_first;
+	int ofs_first, ofs_last, tree_last;
 };
 
 #if !defined(NO_PTHREADS) && defined(NO_THREAD_SAFE_PREAD)
@@ -437,6 +439,7 @@ static struct base_data *alloc_base_data(void)
 	memset(base, 0, sizeof(*base));
 	base->ref_last = -1;
 	base->ofs_last = -1;
+	base->tree_last = -1;
 	return base;
 }
 
@@ -670,6 +673,8 @@ static void *unpack_tree_v4(struct object_entry *obj,
 	}
 
 	if (last_base) {
+		if (nr_deltas - delta_start > 1)
+			die("sorry guys, multi-base trees are not supported yet");
 		strbuf_release(&sb);
 		return NULL;
 	} else {
@@ -794,6 +799,83 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	return data;
 }
 
+static void *patch_one_base_tree(const struct object_entry *src,
+				 const unsigned char *src_buf,
+				 const unsigned char *delta_buf,
+				 unsigned long delta_size,
+				 unsigned long *dst_size)
+{
+	unsigned int nr;
+	const unsigned char *last_base = NULL;
+	struct strbuf sb = STRBUF_INIT;
+	const unsigned char *p = delta_buf;
+
+	nr = decode_varint(&p);
+	while (nr && p < delta_buf + delta_size) {
+		unsigned int copy_start_or_path = decode_varint(&p);
+		if (copy_start_or_path & 1) { /* copy_start */
+			struct tree_desc desc;
+			struct name_entry entry;
+			unsigned int copy_count = decode_varint(&p);
+			unsigned int copy_start = copy_start_or_path >> 1;
+			if (!src)
+				die("we are not supposed to copy from another tree!");
+			if (copy_count & 1) { /* first delta */
+				unsigned int id = decode_varint(&p);
+				if (!id) {
+					last_base = p;
+					p += 20;
+				} else
+					last_base = sha1_table + (id - 1) * 20;
+				if (hashcmp(last_base, src->idx.sha1))
+					die(_("bad tree base in patch_one_base_tree"));
+			} else if (!last_base)
+				die(_("bad copy count index in patch_one_base_tree"));
+			copy_count >>= 1;
+			if (!copy_count)
+				die(_("bad copy count index in patch_one_base_tree"));
+			nr -= copy_count;
+
+			init_tree_desc(&desc, src_buf, src->size);
+			while (tree_entry(&desc, &entry)) {
+				if (copy_start)
+					copy_start--;
+				else if (copy_count) {
+					strbuf_addf(&sb, "%o %s%c", entry.mode, entry.path, '\0');
+					strbuf_add(&sb, entry.sha1, 20);
+					copy_count--;
+				} else
+					break;
+			}
+		} else {	/* path */
+			unsigned int path_idx = copy_start_or_path >> 1;
+			const unsigned char *path;
+			unsigned mode;
+			unsigned int id;
+			const unsigned char *entry_sha1;
+
+			if (path_idx >= path_dict->nb_entries)
+				die(_("bad path index in unpack_tree_v4"));
+			id = decode_varint(&p);
+			if (!id) {
+				entry_sha1 = p;
+				p += 20;
+			} else
+				entry_sha1 = sha1_table + (id - 1) * 20;
+			nr--;
+
+			path = path_dict->data + path_dict->offsets[path_idx];
+			mode = (path[0] << 8) | path[1];
+			strbuf_addf(&sb, "%o %s%c", mode, path+2, '\0');
+			strbuf_add(&sb, entry_sha1, 20);
+		}
+	}
+	if (nr != 0 || p != delta_buf + delta_size)
+		die(_("bad delta tree"));
+	*dst_size = sb.len;
+	return sb.buf;
+}
+
 static void *unpack_data(struct object_entry *obj,
 			 int (*consume)(const unsigned char *, unsigned long, void *),
 			 void *cb_data)
@@ -855,8 +937,33 @@ static void *unpack_data(struct object_entry *obj,
 	return data;
 }
 
+static void *get_tree_v4_from_pack(struct object_entry *obj,
+				   unsigned long *len_p)
+{
+	off_t from = obj[0].idx.offset + obj[0].hdr_size;
+	unsigned long len = obj[1].idx.offset - from;
+	unsigned char *data;
+	ssize_t n;
+
+	data = xmalloc(len);
+	n = pread(pack_fd, data, len, from);
+	if (n < 0)
+		die_errno(_("cannot pread pack file"));
+	if (!n)
+		die(Q_("premature end of pack file, %lu byte missing",
+		       "premature end of pack file, %lu bytes missing",
+		       len),
+		    len);
+	if (len_p)
+		*len_p = len;
+	return data;
+}
+
 static void *get_data_from_pack(struct object_entry *obj)
 {
+	if (obj->type == OBJ_PV4_COMMIT || obj->type == OBJ_PV4_TREE)
+		die("BUG: unsupported code path");
+
 	return unpack_data(obj, NULL, NULL);
 }
 
@@ -1096,14 +1203,25 @@ static void *get_base_data(struct base_data *c)
 		struct object_entry *obj = c->obj;
 		struct base_data **delta = NULL;
 		int delta_nr = 0, delta_alloc = 0;
+		unsigned long size, len;
 
-		while (is_delta_type(c->obj->type) && !c->data) {
+		while ((is_delta_type(c->obj->type) ||
+			(c->base && c->obj->type == OBJ_PV4_TREE)) &&
+		       !c->data) {
 			ALLOC_GROW(delta, delta_nr + 1, delta_alloc);
 			delta[delta_nr++] = c;
 			c = c->base;
 		}
 		if (!delta_nr) {
-			c->data = get_data_from_pack(obj);
+			if (c->obj->type == OBJ_PV4_TREE) {
+				void *tree_v4 = get_tree_v4_from_pack(obj, &len);
+				c->data = patch_one_base_tree(NULL, NULL,
+							      tree_v4, len, &size);
+				if (size != obj->size)
+					die("size mismatch");
+				free(tree_v4);
+			} else
+				c->data = get_data_from_pack(obj);
 			c->size = obj->size;
 			get_thread_data()->base_cache_used += c->size;
 			prune_base_data(c);
@@ -1113,11 +1231,18 @@ static void *get_base_data(struct base_data *c)
 			c = delta[delta_nr - 1];
 			obj = c->obj;
 			base = get_base_data(c->base);
-			raw = get_data_from_pack(obj);
-			c->data = patch_delta(
-				base, c->base->size,
-				raw, obj->size,
-				&c->size);
+			if (c->obj->type == OBJ_PV4_TREE) {
+				raw = get_tree_v4_from_pack(obj, &len);
+				c->data = patch_one_base_tree(c->base->obj, base,
+							      raw, len, &size);
+				if (size != obj->size)
+					die("size mismatch");
+			} else {
+				raw = get_data_from_pack(obj);
+				c->data = patch_delta(base, c->base->size,
+						      raw, obj->size,
+						      &c->size);
+			}
 			free(raw);
 			if (!c->data)
 				bad_object(obj->idx.offset, _("failed to apply delta"));
@@ -1133,6 +1258,8 @@ static void resolve_delta(struct object_entry *delta_obj,
 			  struct base_data *base, struct base_data *result)
 {
 	void *base_data, *delta_data;
+	int tree_v4 = delta_obj->type == OBJ_PV4_TREE;
+	unsigned long tree_size;
 
 	delta_obj->real_type = base->obj->real_type;
 	if (show_stat) {
@@ -1143,10 +1270,18 @@ static void resolve_delta(struct object_entry *delta_obj,
 		deepest_delta_unlock();
 	}
 	delta_obj->base_object_no = base->obj - objects;
-	delta_data = get_data_from_pack(delta_obj);
+	if (tree_v4)
+		delta_data = get_tree_v4_from_pack(delta_obj, &tree_size);
+	else
+		delta_data = get_data_from_pack(delta_obj);
 	base_data = get_base_data(base);
 	result->obj = delta_obj;
-	result->data = patch_delta(base_data, base->size,
+	if (tree_v4)
+		result->data = patch_one_base_tree(base->obj, base_data,
+						   delta_data, tree_size,
+						   &result->size);
+	else
+		result->data = patch_delta(base_data, base->size,
 				   delta_data, delta_obj->size, &result->size);
 	free(delta_data);
 	if (!result->data)
@@ -1164,7 +1299,8 @@ static void resolve_delta(struct object_entry *delta_obj,
 static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 						  struct base_data *prev_base)
 {
-	if (base->ref_last == -1 && base->ofs_last == -1) {
+	if (base->ref_last == -1 && base->ofs_last == -1 &&
+	    base->tree_last == -1) {
 		union delta_base base_spec;
 
 		hashcpy(base_spec.sha1, base->obj->idx.sha1);
@@ -1177,9 +1313,15 @@ static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 			find_delta_children(&base_spec,
 					    &base->ofs_first, &base->ofs_last,
 					    OBJ_OFS_DELTA);
+		} else {
+			hashcpy(base_spec.sha1, base->obj->idx.sha1);
+			find_delta_children(&base_spec,
+					    &base->tree_first, &base->tree_last,
+					    OBJ_PV4_TREE);
 		}
 
-		if (base->ref_last == -1 && base->ofs_last == -1) {
+		if (base->ref_last == -1 && base->ofs_last == -1 &&
+		    base->tree_last == -1) {
 			free(base->data);
 			return NULL;
 		}
@@ -1213,6 +1355,25 @@ static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 		return result;
 	}
 
+	while (base->tree_first <= base->tree_last) {
+		struct object_entry *child = objects + deltas[base->tree_first].obj_no;
+		struct base_data *result;
+
+		assert(child->type == OBJ_PV4_TREE);
+		if (child->nr_bases > 1) {
+			/* maybe resolved in the third pass or something */
+			base->tree_first++;
+			continue;
+		}
+		result = alloc_base_data();
+		resolve_delta(child, base, result);
+		if (base->tree_first == base->tree_last)
+			free_base_data(base);
+
+		base->tree_first++;
+		return result;
+	}
+
 	unlink_base_data(base);
 	return NULL;
 }
@@ -1266,7 +1427,8 @@ static void *threaded_second_pass(void *data)
 		counter_unlock();
 		work_lock();
 		while (nr_dispatched < nr_objects &&
-		       is_delta_type(objects[nr_dispatched].type))
+		       (is_delta_type(objects[nr_dispatched].type) ||
+			is_delta_tree(objects + nr_dispatched)))
 			nr_dispatched++;
 		if (nr_dispatched >= nr_objects) {
 			work_unlock();
@@ -1411,7 +1573,7 @@ static void resolve_deltas(void)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
 
-		if (is_delta_type(obj->type))
+		if (is_delta_type(obj->type) || is_delta_tree(obj))
 			continue;
 		resolve_base(obj);
 		display_progress(progress, nr_resolved_deltas);
@@ -1956,7 +2118,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 		show_pack_info(stat_only);
 
 	if (packv4)
-		die("we're not there yet");
+		opts.version = 3;
 
 	idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
 	for (i = 0; i < nr_objects; i++)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* Re: [PATCH 05/12] index-pack: parse v4 header and dictionaries
  2013-09-07 10:43   ` [PATCH 05/12] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
@ 2013-09-08  2:14     ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-08  2:14 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 646 bytes --]

On Sat, 7 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---

[...]

> @@ -1677,6 +1727,15 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
>  	free(objects);
>  	free(index_name_buf);
>  	free(keep_name_buf);
> +	free(sha1_table);
> +	if (name_dict) {
> +		free((void*)name_dict->data);
> +		free(name_dict);
> +	}
> +	if (path_dict) {
> +		free((void*)path_dict->data);
> +		free(path_dict);
> +	}

The freeing of dictionary tables should probably have its own function 
in packv4-parse.c, and a call to it added in free_pack_by_name() as 
well.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 08/12] index-pack: parse v4 tree format
  2013-09-07 10:43   ` [PATCH 08/12] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
@ 2013-09-08  2:52     ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-08  2:52 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5649 bytes --]

On Sat, 7 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  builtin/index-pack.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 4 deletions(-)
> 
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 210b78d..51ca64b 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -319,6 +319,21 @@ static const unsigned char *read_sha1ref(void)
>  	return sha1_table + index * 20;
>  }
>  
> +static const unsigned char *read_sha1table_ref(void)
> +{
> +	const unsigned char *sha1 = read_sha1ref();
> +	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects * 20) {
> +		unsigned char *found;
> +		found = bsearch(sha1, sha1_table, nr_objects, 20,
> +				(int (*)(const void *, const void *))hashcmp);
> +		if (!found)
> +			bad_object(consumed_bytes,
> +				   _("SHA-1 %s not found in SHA-1 table"),
> +				   sha1_to_hex(sha1));
> +	}
> +	return sha1;
> +}
> +
>  static const unsigned char *read_dictref(struct packv4_dict *dict)
>  {
>  	unsigned int index = read_varint();
> @@ -561,17 +576,93 @@ static void *unpack_commit_v4(unsigned int offset,
>  	return dst.buf;
>  }
>  
> -static void *unpack_entry_data(unsigned long offset, unsigned long size,
> -			       enum object_type type, unsigned char *sha1)
> +/*
> + * v4 trees are actually kind of deltas and we don't do delta in the
> + * first pass. This function only walks through a tree object to find
> + * the end offset, register object dependencies and performs limited
> + * validation.
> + */
> +static void *unpack_tree_v4(struct object_entry *obj,
> +			    unsigned int offset, unsigned long size,
> +			    unsigned char *sha1)
> +{
> +	unsigned int nr = read_varint();
> +	const unsigned char *last_base = NULL;
> +	struct strbuf sb = STRBUF_INIT;
> +	while (nr) {
> +		unsigned int copy_start_or_path = read_varint();
> +		if (copy_start_or_path & 1) { /* copy_start */
> +			unsigned int copy_count = read_varint();
> +			if (copy_count & 1) { /* first delta */
> +				last_base = read_sha1table_ref();
> +			} else if (!last_base)
> +				bad_object(offset,
> +					   _("bad copy count index in unpack_tree_v4"));

Here the error message could be a little more explicit, e.g. "missing 
delta base" or the like, in order to distinguish it from the next error.

> +			copy_count >>= 1;
> +			if (!copy_count)
> +				bad_object(offset,
> +					   _("bad copy count index in unpack_tree_v4"));
> +			nr -= copy_count;

Also make sure copy_count <= nr here.

> +		} else {	/* path */
> +			unsigned int path_idx = copy_start_or_path >> 1;
> +			const unsigned char *entry_sha1;
> +
> +			if (path_idx >= path_dict->nb_entries)
> +				bad_object(offset,
> +					   _("bad path index in unpack_tree_v4"));
> +			entry_sha1 = read_sha1ref();
> +			nr--;
> +
> +			if (!last_base) {

I've been confused for a while here by the use of last_base in the 
non-delta path.  A comment indicating why it is used here might be 
helpful to those unfamiliar with the format.

> +				const unsigned char *path;
> +				unsigned mode;
> +
> +				path = path_dict->data + path_dict->offsets[path_idx];
> +				mode = (path[0] << 8) | path[1];
> +				strbuf_addf(&sb, "%o %s%c", mode, path+2, '\0');
> +				strbuf_add(&sb, entry_sha1, 20);
> +				if (sb.len > size)
> +					bad_object(offset,
> +						   _("tree larger than expected"));
> +			}
> +		}
> +	}
> +
> +	if (last_base) {
> +		strbuf_release(&sb);
> +		return NULL;
> +	} else {
> +		git_SHA_CTX ctx;
> +		char hdr[32];
> +		int hdrlen;
> +
> +		if (sb.len != size)
> +			bad_object(offset, _("tree size mismatch"));
> +
> +		hdrlen = sprintf(hdr, "tree %lu", size) + 1;
> +		git_SHA1_Init(&ctx);
> +		git_SHA1_Update(&ctx, hdr, hdrlen);
> +		git_SHA1_Update(&ctx, sb.buf, size);
> +		git_SHA1_Final(sha1, &ctx);
> +		return strbuf_detach(&sb, NULL);
> +	}
> +}
> +
> +static void *unpack_entry_data(struct object_entry *obj, unsigned char *sha1)
>  {
>  	static char fixed_buf[8192];
>  	void *buf;
>  	git_SHA_CTX c;
>  	char hdr[32];
>  	int hdrlen;
> +	unsigned long offset = obj->idx.offset;
> +	unsigned long size = obj->size;
> +	enum object_type type = obj->type;
>  
>  	if (type == OBJ_PV4_COMMIT)
>  		return unpack_commit_v4(offset, size, sha1);
> +	if (type == OBJ_PV4_TREE)
> +		return unpack_tree_v4(obj, offset, size, sha1);
>  
>  	if (!is_delta_type(type)) {
>  		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
> @@ -640,16 +731,19 @@ static void *unpack_raw_entry(struct object_entry *obj,
>  	case OBJ_BLOB:
>  	case OBJ_TAG:
>  		break;
> -
>  	case OBJ_PV4_COMMIT:
>  		obj->real_type = OBJ_COMMIT;
>  		break;
> +	case OBJ_PV4_TREE:
> +		obj->real_type = OBJ_TREE;
> +		break;
> +
>  	default:
>  		bad_object(obj->idx.offset, _("unknown object type %d"), obj->type);
>  	}
>  	obj->hdr_size = consumed_bytes - obj->idx.offset;
>  
> -	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
> +	data = unpack_entry_data(obj, sha1);
>  	obj->idx.crc32 = input_crc32;
>  	return data;
>  }
> @@ -1186,6 +1280,8 @@ static void parse_pack_objects(unsigned char *sha1)
>  			nr_deltas++;
>  			delta->obj_no = i;
>  			delta++;
> +		} else if (!data && obj->type == OBJ_PV4_TREE) {
> +			/* delay sha1_object() until second pass */
>  		} else if (!data) {
>  			/* large blobs, check later */
>  			obj->real_type = OBJ_BAD;
> -- 
> 1.8.2.83.gc99314b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 12/12] index-pack: resolve v4 one-base trees
  2013-09-07 10:43   ` [PATCH 12/12] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
@ 2013-09-08  3:28     ` Nicolas Pitre
  2013-09-08  3:44       ` Duy Nguyen
  0 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-08  3:28 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 4233 bytes --]

On Sat, 7 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> This is the most common case for delta trees. In fact it's the only
> kind that's produced by packv4-create. It fits well in the way
> index-pack resolves deltas and benefits from threading (the set of
> objects depending on this base does not overlap with the set of
> objects depending on another base).
> 
> Multi-base trees will probably be processed differently.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  builtin/index-pack.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 178 insertions(+), 16 deletions(-)
> 
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 1fa74f4..4a24bc3 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -12,6 +12,8 @@
>  #include "streaming.h"
>  #include "thread-utils.h"
>  #include "packv4-parse.h"
> +#include "varint.h"
> +#include "tree-walk.h"
>  
>  static const char index_pack_usage[] =
>  "git index-pack [-v] [-o <index-file>] [--keep | --keep=<msg>] [--verify] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
> @@ -38,8 +40,8 @@ struct base_data {
>  	struct object_entry *obj;
>  	void *data;
>  	unsigned long size;
> -	int ref_first, ref_last;
> -	int ofs_first, ofs_last;
> +	int ref_first, ref_last, tree_first;
> +	int ofs_first, ofs_last, tree_last;
>  };
>  
>  #if !defined(NO_PTHREADS) && defined(NO_THREAD_SAFE_PREAD)
> @@ -437,6 +439,7 @@ static struct base_data *alloc_base_data(void)
>  	memset(base, 0, sizeof(*base));
>  	base->ref_last = -1;
>  	base->ofs_last = -1;
> +	base->tree_last = -1;
>  	return base;
>  }
>  
> @@ -670,6 +673,8 @@ static void *unpack_tree_v4(struct object_entry *obj,
>  	}
>  
>  	if (last_base) {
> +		if (nr_deltas - delta_start > 1)
> +			die("sorry guys, multi-base trees are not supported yet");
>  		strbuf_release(&sb);
>  		return NULL;
>  	} else {
> @@ -794,6 +799,83 @@ static void *unpack_raw_entry(struct object_entry *obj,
>  	return data;
>  }
>  
> +static void *patch_one_base_tree(const struct object_entry *src,
> +				 const unsigned char *src_buf,
> +				 const unsigned char *delta_buf,
> +				 unsigned long delta_size,
> +				 unsigned long *dst_size)
> +{
> +	unsigned int nr;
> +	const unsigned char *last_base = NULL;
> +	struct strbuf sb = STRBUF_INIT;
> +	const unsigned char *p = delta_buf;
> +
> +	nr = decode_varint(&p);
> +	while (nr && p < delta_buf + delta_size) {
> +		unsigned int copy_start_or_path = decode_varint(&p);
> +		if (copy_start_or_path & 1) { /* copy_start */
> +			struct tree_desc desc;
> +			struct name_entry entry;
> +			unsigned int copy_count = decode_varint(&p);
> +			unsigned int copy_start = copy_start_or_path >> 1;
> +			if (!src)
> +				die("we are not supposed to copy from another tree!");
> +			if (copy_count & 1) { /* first delta */
> +				unsigned int id = decode_varint(&p);
> +				if (!id) {
> +					last_base = p;
> +					p += 20;
> +				} else
> +					last_base = sha1_table + (id - 1) * 20;
> +				if (hashcmp(last_base, src->idx.sha1))
> +					die(_("bad tree base in patch_one_base_tree"));
> +			} else if (!last_base)
> +				die(_("bad copy count index in patch_one_base_tree"));
> +			copy_count >>= 1;
> +			if (!copy_count)
> +				die(_("bad copy count index in patch_one_base_tree"));
> +			nr -= copy_count;
> +
> +			init_tree_desc(&desc, src_buf, src->size);
> +			while (tree_entry(&desc, &entry)) {
> +				if (copy_start)
> +					copy_start--;
> +				else if (copy_count) {
> +					strbuf_addf(&sb, "%o %s%c", entry.mode, entry.path, '\0');
> +					strbuf_add(&sb, entry.sha1, 20);
> +					copy_count--;
> +				} else
> +					break;
> +			}
> +		} else {	/* path */
> +			unsigned int path_idx = copy_start_or_path >> 1;
> +			const unsigned char *path;
> +			unsigned mode;
> +			unsigned int id;
> +			const unsigned char *entry_sha1;
> +
> +			if (path_idx >= path_dict->nb_entries)
> +				die(_("bad path index in unpack_tree_v4"));
> +			id = decode_varint(&p);
> +			if (!id) {
> +				entry_sha1 = p;
> +				p += 20;
> +			} else
> +				entry_sha1 = sha1_table + (id - 1) * 20;

You should verify that id doesn't overflow the sha1 table here.
Similarly in other places.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 12/12] index-pack: resolve v4 one-base trees
  2013-09-08  3:28     ` Nicolas Pitre
@ 2013-09-08  3:44       ` Duy Nguyen
  0 siblings, 0 replies; 124+ messages in thread
From: Duy Nguyen @ 2013-09-08  3:44 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Sun, Sep 8, 2013 at 10:28 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> @@ -794,6 +799,83 @@ static void *unpack_raw_entry(struct object_entry *obj,
>>       return data;
>>  }
>>
>> +static void *patch_one_base_tree(const struct object_entry *src,
>> +                              const unsigned char *src_buf,
>> +                              const unsigned char *delta_buf,
>> +                              unsigned long delta_size,
>> +                              unsigned long *dst_size)
>> +{
>> +     unsigned int nr;
>> +     const unsigned char *last_base = NULL;
>> +     struct strbuf sb = STRBUF_INIT;
>> +     const unsigned char *p = delta_buf;
>> +
>> +     nr = decode_varint(&p);
>> +     while (nr && p < delta_buf + delta_size) {
>> +             unsigned int copy_start_or_path = decode_varint(&p);
>> +             if (copy_start_or_path & 1) { /* copy_start */
>> +                     struct tree_desc desc;
>> +                     struct name_entry entry;
>> +                     unsigned int copy_count = decode_varint(&p);
>> +                     unsigned int copy_start = copy_start_or_path >> 1;
>> +                     if (!src)
>> +                             die("we are not supposed to copy from another tree!");
>> +                     if (copy_count & 1) { /* first delta */
>> +                             unsigned int id = decode_varint(&p);
>> +                             if (!id) {
>> +                                     last_base = p;
>> +                                     p += 20;
>> +                             } else
>> +                                     last_base = sha1_table + (id - 1) * 20;
>> +                             if (hashcmp(last_base, src->idx.sha1))
>> +                                     die(_("bad tree base in patch_one_base_tree"));
>> +                     } else if (!last_base)
>> +                             die(_("bad copy count index in patch_one_base_tree"));
>> +                     copy_count >>= 1;
>> +                     if (!copy_count)
>> +                             die(_("bad copy count index in patch_one_base_tree"));
>> +                     nr -= copy_count;
>> +
>> +                     init_tree_desc(&desc, src_buf, src->size);
>> +                     while (tree_entry(&desc, &entry)) {
>> +                             if (copy_start)
>> +                                     copy_start--;
>> +                             else if (copy_count) {
>> +                                     strbuf_addf(&sb, "%o %s%c", entry.mode, entry.path, '\0');
>> +                                     strbuf_add(&sb, entry.sha1, 20);
>> +                                     copy_count--;
>> +                             } else
>> +                                     break;
>> +                     }
>> +             } else {        /* path */
>> +                     unsigned int path_idx = copy_start_or_path >> 1;
>> +                     const unsigned char *path;
>> +                     unsigned mode;
>> +                     unsigned int id;
>> +                     const unsigned char *entry_sha1;
>> +
>> +                     if (path_idx >= path_dict->nb_entries)
>> +                             die(_("bad path index in unpack_tree_v4"));
>> +                     id = decode_varint(&p);
>> +                     if (!id) {
>> +                             entry_sha1 = p;
>> +                             p += 20;
>> +                     } else
>> +                             entry_sha1 = sha1_table + (id - 1) * 20;
>
> You should verify that id doesn't overflow the sha1 table here.
> Similarly in other places.

I think it's unnecessary. All trees must have been checked by
unpack_tree_v4() in the first pass. Overflow should be caught there if
found.

-- 
Duy

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH v2 00/14] pack v4 support in index-pack
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                     ` (11 preceding siblings ...)
  2013-09-07 10:43   ` [PATCH 12/12] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22   ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 01/14] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
                       ` (13 more replies)
  12 siblings, 14 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

Mostly cleanups after Nico's comments. The diff against v1 is:

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 4a24bc3..88340b5 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -22,8 +22,8 @@ struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;
 	unsigned int hdr_size;
-	enum object_type type;
-	enum object_type real_type;
+	enum object_type type;	/* type as written in pack */
+	enum object_type real_type; /* type after delta resolving */
 	unsigned delta_depth;
 	int base_object_no;
 	int nr_bases;		/* only valid for v4 trees */
@@ -194,8 +194,10 @@ static int mark_link(struct object *obj, int type, void *data)
 	return 0;
 }
 
-/* The content of each linked object must have been checked
-   or it must be already present in the object database */
+/*
+ * The content of each linked object must have been checked or it must
+ * be already present in the object database
+ */
 static unsigned check_object(struct object *obj)
 {
 	if (!obj)
@@ -289,6 +291,19 @@ static inline void *fill_and_use(int bytes)
 	return p;
 }
 
+static void check_against_sha1table(const unsigned char *sha1)
+{
+	const unsigned char *found;
+	if (!packv4)
+		return;
+
+	found = bsearch(sha1, sha1_table, nr_objects, 20,
+			(int (*)(const void *, const void *))hashcmp);
+	if (!found)
+		die(_("object %s not found in SHA-1 table"),
+		    sha1_to_hex(sha1));
+}
+
 static NORETURN void bad_object(unsigned long offset, const char *format,
 		       ...) __attribute__((format (printf, 2, 3)));
 
@@ -325,15 +340,8 @@ static const unsigned char *read_sha1ref(void)
 static const unsigned char *read_sha1table_ref(void)
 {
 	const unsigned char *sha1 = read_sha1ref();
-	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects * 20) {
-		unsigned char *found;
-		found = bsearch(sha1, sha1_table, nr_objects, 20,
-				(int (*)(const void *, const void *))hashcmp);
-		if (!found)
-			bad_object(consumed_bytes,
-				   _("SHA-1 %s not found in SHA-1 table"),
-				   sha1_to_hex(sha1));
-	}
+	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects * 20)
+		check_against_sha1table(sha1);
 	return sha1;
 }
 
@@ -346,21 +354,6 @@ static const unsigned char *read_dictref(struct packv4_dict *dict)
 	return  dict->data + dict->offsets[index];
 }
 
-static void *read_data(int size)
-{
-	const int max = sizeof(input_buffer);
-	void *buf;
-	char *p;
-	p = buf = xmalloc(size);
-	while (size) {
-		int to_fill = size > max ? max : size;
-		memcpy(p, fill_and_use(to_fill), to_fill);
-		p += to_fill;
-		size -= to_fill;
-	}
-	return buf;
-}
-
 static const char *open_pack_file(const char *pack_name)
 {
 	if (from_stdin) {
@@ -532,8 +525,7 @@ static void read_and_inflate(unsigned long offset,
 		git_SHA1_Final(sha1, ctx);
 }
 
-static void *unpack_commit_v4(unsigned int offset,
-			      unsigned long size,
+static void *unpack_commit_v4(unsigned int offset, unsigned long size,
 			      unsigned char *sha1)
 {
 	unsigned int nb_parents;
@@ -622,7 +614,8 @@ static void add_tree_delta_base(struct object_entry *obj,
  * v4 trees are actually kind of deltas and we don't do delta in the
  * first pass. This function only walks through a tree object to find
  * the end offset, register object dependencies and performs limited
- * validation.
+ * validation. For v4 trees that have no dependencies, we do
+ * uncompress and calculate their SHA-1.
  */
 static void *unpack_tree_v4(struct object_entry *obj,
 			    unsigned int offset, unsigned long size,
@@ -641,9 +634,9 @@ static void *unpack_tree_v4(struct object_entry *obj,
 				add_tree_delta_base(obj, last_base, delta_start);
 			} else if (!last_base)
 				bad_object(offset,
-					   _("bad copy count index in unpack_tree_v4"));
+					   _("missing delta base unpack_tree_v4"));
 			copy_count >>= 1;
-			if (!copy_count)
+			if (!copy_count || copy_count > nr)
 				bad_object(offset,
 					   _("bad copy count index in unpack_tree_v4"));
 			nr -= copy_count;
@@ -657,6 +650,13 @@ static void *unpack_tree_v4(struct object_entry *obj,
 			entry_sha1 = read_sha1ref();
 			nr--;
 
+			/*
+			 * Attempt to rebuild a canonical (base) tree.
+			 * If last_base is set, this tree depends on
+			 * another tree, which we have no access at this
+			 * stage, so reconstruction must be delayed until
+			 * the second pass.
+			 */
 			if (!last_base) {
 				const unsigned char *path;
 				unsigned mode;
@@ -694,6 +694,11 @@ static void *unpack_tree_v4(struct object_entry *obj,
 	}
 }
 
+/*
+ * Unpack an entry data in the streamed pack, calculate the object
+ * SHA-1 if it's not a large blob. Otherwise just try to inflate the
+ * object to /dev/null to determine the end of the entry in the pack.
+ */
 static void *unpack_entry_data(struct object_entry *obj, unsigned char *sha1)
 {
 	static char fixed_buf[8192];
@@ -799,19 +804,23 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	return data;
 }
 
+/*
+ * Some checks are skipped because they are already done by
+ * unpack_tree_v4() in the first pass.
+ */
 static void *patch_one_base_tree(const struct object_entry *src,
 				 const unsigned char *src_buf,
 				 const unsigned char *delta_buf,
 				 unsigned long delta_size,
 				 unsigned long *dst_size)
 {
-	unsigned int nr;
+	int nr;
 	const unsigned char *last_base = NULL;
 	struct strbuf sb = STRBUF_INIT;
 	const unsigned char *p = delta_buf;
 
 	nr = decode_varint(&p);
-	while (nr && p < delta_buf + delta_size) {
+	while (nr > 0 && p < delta_buf + delta_size) {
 		unsigned int copy_start_or_path = decode_varint(&p);
 		if (copy_start_or_path & 1) { /* copy_start */
 			struct tree_desc desc;
@@ -829,11 +838,9 @@ static void *patch_one_base_tree(const struct object_entry *src,
 					last_base = sha1_table + (id - 1) * 20;
 				if (hashcmp(last_base, src->idx.sha1))
 					die(_("bad tree base in patch_one_base_tree"));
-			} else if (!last_base)
-				die(_("bad copy count index in patch_one_base_tree"));
+			}
+
 			copy_count >>= 1;
-			if (!copy_count)
-				die(_("bad copy count index in patch_one_base_tree"));
 			nr -= copy_count;
 
 			init_tree_desc(&desc, src_buf, src->size);
@@ -841,7 +848,8 @@ static void *patch_one_base_tree(const struct object_entry *src,
 				if (copy_start)
 					copy_start--;
 				else if (copy_count) {
-					strbuf_addf(&sb, "%o %s%c", entry.mode, entry.path, '\0');
+					strbuf_addf(&sb, "%o %s%c",
+						    entry.mode, entry.path, '\0');
 					strbuf_add(&sb, entry.sha1, 20);
 					copy_count--;
 				} else
@@ -854,8 +862,6 @@ static void *patch_one_base_tree(const struct object_entry *src,
 			unsigned int id;
 			const unsigned char *entry_sha1;
 
-			if (path_idx >= path_dict->nb_entries)
-				die(_("bad path index in unpack_tree_v4"));
 			id = decode_varint(&p);
 			if (!id) {
 				entry_sha1 = p;
@@ -876,6 +882,11 @@ static void *patch_one_base_tree(const struct object_entry *src,
 	return sb.buf;
 }
 
+/*
+ * Unpack entry data in the second pass when the pack is already
+ * stored on disk. consume call back is used for large-blob case. Must
+ * be thread safe.
+ */
 static void *unpack_data(struct object_entry *obj,
 			 int (*consume)(const unsigned char *, unsigned long, void *),
 			 void *cb_data)
@@ -1079,19 +1090,6 @@ static int check_collison(struct object_entry *entry)
 	return 0;
 }
 
-static void check_against_sha1table(struct object_entry *obj)
-{
-	const unsigned char *found;
-	if (!packv4)
-		return;
-
-	found = bsearch(obj->idx.sha1, sha1_table, nr_objects, 20,
-			(int (*)(const void *, const void *))hashcmp);
-	if (!found)
-		die(_("object %s not found in SHA-1 table"),
-		    sha1_to_hex(obj->idx.sha1));
-}
-
 static void sha1_object(const void *data, struct object_entry *obj_entry,
 			unsigned long size, enum object_type type,
 			const unsigned char *sha1)
@@ -1288,7 +1286,7 @@ static void resolve_delta(struct object_entry *delta_obj,
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
-	check_against_sha1table(delta_obj);
+	check_against_sha1table(delta_obj->idx.sha1);
 	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
@@ -1296,6 +1294,11 @@ static void resolve_delta(struct object_entry *delta_obj,
 	counter_unlock();
 }
 
+/*
+ * Given a base object, search for all objects depending on the base,
+ * try to unpack one of those object. The function will be called
+ * repeatedly until all objects are unpacked.
+ */
 static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 						  struct base_data *prev_base)
 {
@@ -1408,6 +1411,10 @@ static int compare_delta_entry(const void *a, const void *b)
 				   objects[delta_b->obj_no].type);
 }
 
+/*
+ * Unpack all objects depending directly or indirectly on the given
+ * object
+ */
 static void resolve_base(struct object_entry *obj)
 {
 	struct base_data *base_obj = alloc_base_data();
@@ -1417,6 +1424,7 @@ static void resolve_base(struct object_entry *obj)
 }
 
 #ifndef NO_PTHREADS
+/* Call resolve_base() in multiple threads */
 static void *threaded_second_pass(void *data)
 {
 	set_thread_data(data);
@@ -1460,10 +1468,19 @@ static struct packv4_dict *read_dict(void)
 
 static void parse_dictionaries(void)
 {
+	int i;
 	if (!packv4)
 		return;
 
-	sha1_table = read_data(20 * nr_objects);
+	sha1_table = xmalloc(20 * nr_objects);
+	hashcpy(sha1_table, fill_and_use(20));
+	for (i = 1; i < nr_objects; i++) {
+		unsigned char *p = sha1_table + i * 20;
+		hashcpy(p, fill_and_use(20));
+		if (hashcmp(p - 20, p) >= 0)
+			die(_("wrong order in SHA-1 table at entry %d"), i);
+	}
+
 	name_dict = read_dict();
 	path_dict = read_dict();
 }
@@ -1492,9 +1509,9 @@ static void parse_pack_objects(unsigned char *sha1)
 			/* large blobs, check later */
 			obj->real_type = OBJ_BAD;
 			nr_delays++;
-			check_against_sha1table(obj);
+			check_against_sha1table(obj->idx.sha1);
 		} else {
-			check_against_sha1table(obj);
+			check_against_sha1table(obj->idx.sha1);
 			sha1_object(data, NULL, obj->size, obj->real_type,
 				    obj->idx.sha1);
 		}
@@ -2137,14 +2154,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	free(index_name_buf);
 	free(keep_name_buf);
 	free(sha1_table);
-	if (name_dict) {
-		free((void*)name_dict->data);
-		free(name_dict);
-	}
-	if (path_dict) {
-		free((void*)path_dict->data);
-		free(path_dict);
-	}
+	pv4_free_dict(name_dict);
+	pv4_free_dict(path_dict);
 	if (pack_name == NULL)
 		free((void *) curr_pack);
 	if (index_name == NULL)
diff --git a/packv4-parse.c b/packv4-parse.c
index 82661ba..d515bb9 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -63,6 +63,14 @@ struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size)
 	return dict;
 }
 
+void pv4_free_dict(struct packv4_dict *dict)
+{
+	if (dict) {
+		free((void*)dict->data);
+		free(dict);
+	}
+}
+
 static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 {
 	struct pack_window *w_curs = NULL;
diff --git a/packv4-parse.h b/packv4-parse.h
index 0b2405a..e6719f6 100644
--- a/packv4-parse.h
+++ b/packv4-parse.h
@@ -8,6 +8,7 @@ struct packv4_dict {
 };
 
 struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size);
+void pv4_free_dict(struct packv4_dict *dict);
 
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 		     off_t offset, unsigned long size);
diff --git a/sha1_file.c b/sha1_file.c
index c7bf677..1528e28 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -763,6 +763,8 @@ void free_pack_by_name(const char *pack_name)
 			}
 			close_pack_index(p);
 			free(p->bad_object_sha1);
+			pv4_free_dict(p->ident_dict);
+			pv4_free_dict(p->path_dict);
 			*pp = p->next;
 			if (last_found_pack == p)
 				last_found_pack = NULL;

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 01/14] pack v4: split pv4_create_dict() out of load_dict()
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 02/14] pack v4: add pv4_free_dict() Nguyễn Thái Ngọc Duy
                       ` (12 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 packv4-parse.c | 63 ++++++++++++++++++++++++++++++++--------------------------
 packv4-parse.h |  8 ++++++++
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/packv4-parse.c b/packv4-parse.c
index 63bba03..82661ba 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -30,11 +30,38 @@ const unsigned char *get_sha1ref(struct packed_git *p,
 	return sha1;
 }
 
-struct packv4_dict {
-	const unsigned char *data;
-	unsigned int nb_entries;
-	unsigned int offsets[FLEX_ARRAY];
-};
+struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size)
+{
+	struct packv4_dict *dict;
+	int i;
+
+	/* count number of entries */
+	int nb_entries = 0;
+	const unsigned char *cp = data;
+	while (cp < data + dict_size - 3) {
+		cp += 2;  /* prefix bytes */
+		cp += strlen((const char *)cp);  /* entry string */
+		cp += 1;  /* terminating NUL */
+		nb_entries++;
+	}
+	if (cp - data != dict_size) {
+		error("dict size mismatch");
+		return NULL;
+	}
+
+	dict = xmalloc(sizeof(*dict) + nb_entries * sizeof(dict->offsets[0]));
+	dict->data = data;
+	dict->nb_entries = nb_entries;
+
+	cp = data;
+	for (i = 0; i < nb_entries; i++) {
+		dict->offsets[i] = cp - data;
+		cp += 2;
+		cp += strlen((const char *)cp) + 1;
+	}
+
+	return dict;
+}
 
 static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 {
@@ -45,7 +72,7 @@ static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 	const unsigned char *cp;
 	git_zstream stream;
 	struct packv4_dict *dict;
-	int nb_entries, i, st;
+	int st;
 
 	/* get uncompressed dictionary data size */
 	src = use_pack(p, &w_curs, curpos, &avail);
@@ -77,32 +104,12 @@ static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 		return NULL;
 	}
 
-	/* count number of entries */
-	nb_entries = 0;
-	cp = data;
-	while (cp < data + dict_size - 3) {
-		cp += 2;  /* prefix bytes */
-		cp += strlen((const char *)cp);  /* entry string */
-		cp += 1;  /* terminating NUL */
-		nb_entries++;
-	}
-	if (cp - data != dict_size) {
-		error("dict size mismatch");
+	dict = pv4_create_dict(data, dict_size);
+	if (!dict) {
 		free(data);
 		return NULL;
 	}
 
-	dict = xmalloc(sizeof(*dict) + nb_entries * sizeof(dict->offsets[0]));
-	dict->data = data;
-	dict->nb_entries = nb_entries;
-
-	cp = data;
-	for (i = 0; i < nb_entries; i++) {
-		dict->offsets[i] = cp - data;
-		cp += 2;
-		cp += strlen((const char *)cp) + 1;
-	}
-
 	*offset = curpos;
 	return dict;
 }
diff --git a/packv4-parse.h b/packv4-parse.h
index 5f9d809..0b2405a 100644
--- a/packv4-parse.h
+++ b/packv4-parse.h
@@ -1,6 +1,14 @@
 #ifndef PACKV4_PARSE_H
 #define PACKV4_PARSE_H
 
+struct packv4_dict {
+	const unsigned char *data;
+	unsigned int nb_entries;
+	unsigned int offsets[FLEX_ARRAY];
+};
+
+struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size);
+
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 		     off_t offset, unsigned long size);
 void *pv4_get_tree(struct packed_git *p, struct pack_window **w_curs,
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 02/14] pack v4: add pv4_free_dict()
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 01/14] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 03/14] index-pack: add more comments on some big functions Nguyễn Thái Ngọc Duy
                       ` (11 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 packv4-parse.c | 8 ++++++++
 packv4-parse.h | 1 +
 sha1_file.c    | 2 ++
 3 files changed, 11 insertions(+)

diff --git a/packv4-parse.c b/packv4-parse.c
index 82661ba..d515bb9 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -63,6 +63,14 @@ struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size)
 	return dict;
 }
 
+void pv4_free_dict(struct packv4_dict *dict)
+{
+	if (dict) {
+		free((void*)dict->data);
+		free(dict);
+	}
+}
+
 static struct packv4_dict *load_dict(struct packed_git *p, off_t *offset)
 {
 	struct pack_window *w_curs = NULL;
diff --git a/packv4-parse.h b/packv4-parse.h
index 0b2405a..e6719f6 100644
--- a/packv4-parse.h
+++ b/packv4-parse.h
@@ -8,6 +8,7 @@ struct packv4_dict {
 };
 
 struct packv4_dict *pv4_create_dict(const unsigned char *data, int dict_size);
+void pv4_free_dict(struct packv4_dict *dict);
 
 void *pv4_get_commit(struct packed_git *p, struct pack_window **w_curs,
 		     off_t offset, unsigned long size);
diff --git a/sha1_file.c b/sha1_file.c
index c7bf677..1528e28 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -763,6 +763,8 @@ void free_pack_by_name(const char *pack_name)
 			}
 			close_pack_index(p);
 			free(p->bad_object_sha1);
+			pv4_free_dict(p->ident_dict);
+			pv4_free_dict(p->path_dict);
 			*pp = p->next;
 			if (last_found_pack == p)
 				last_found_pack = NULL;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 03/14] index-pack: add more comments on some big functions
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 01/14] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 02/14] pack v4: add pv4_free_dict() Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 04/14] index-pack: split out varint decoding code Nguyễn Thái Ngọc Duy
                       ` (10 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9c1cfac..1dbabe0 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -19,8 +19,8 @@ struct object_entry {
 	struct pack_idx_entry idx;
 	unsigned long size;
 	unsigned int hdr_size;
-	enum object_type type;
-	enum object_type real_type;
+	enum object_type type;	/* type as written in pack */
+	enum object_type real_type; /* type after delta resolving */
 	unsigned delta_depth;
 	int base_object_no;
 };
@@ -187,8 +187,10 @@ static int mark_link(struct object *obj, int type, void *data)
 	return 0;
 }
 
-/* The content of each linked object must have been checked
-   or it must be already present in the object database */
+/*
+ * The content of each linked object must have been checked or it must
+ * be already present in the object database
+ */
 static unsigned check_object(struct object *obj)
 {
 	if (!obj)
@@ -407,6 +409,11 @@ static int is_delta_type(enum object_type type)
 	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
 }
 
+/*
+ * Unpack an entry data in the streamed pack, calculate the object
+ * SHA-1 if it's not a large blob. Otherwise just try to inflate the
+ * object to /dev/null to determine the end of the entry in the pack.
+ */
 static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
@@ -522,6 +529,11 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	return data;
 }
 
+/*
+ * Unpack entry data in the second pass when the pack is already
+ * stored on disk. consume call back is used for large-blob case. Must
+ * be thread safe.
+ */
 static void *unpack_data(struct object_entry *obj,
 			 int (*consume)(const unsigned char *, unsigned long, void *),
 			 void *cb_data)
@@ -875,6 +887,11 @@ static void resolve_delta(struct object_entry *delta_obj,
 	counter_unlock();
 }
 
+/*
+ * Given a base object, search for all objects depending on the base,
+ * try to unpack one of those object. The function will be called
+ * repeatedly until all objects are unpacked.
+ */
 static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 						  struct base_data *prev_base)
 {
@@ -958,6 +975,10 @@ static int compare_delta_entry(const void *a, const void *b)
 				   objects[delta_b->obj_no].type);
 }
 
+/*
+ * Unpack all objects depending directly or indirectly on the given
+ * object
+ */
 static void resolve_base(struct object_entry *obj)
 {
 	struct base_data *base_obj = alloc_base_data();
@@ -967,6 +988,7 @@ static void resolve_base(struct object_entry *obj)
 }
 
 #ifndef NO_PTHREADS
+/* Call resolve_base() in multiple threads */
 static void *threaded_second_pass(void *data)
 {
 	set_thread_data(data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 04/14] index-pack: split out varint decoding code
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (2 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 03/14] index-pack: add more comments on some big functions Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 05/14] index-pack: do not allocate buffer for unpacking deltas in the first pass Nguyễn Thái Ngọc Duy
                       ` (9 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 82 ++++++++++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 1dbabe0..5fbd517 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -277,6 +277,31 @@ static void use(int bytes)
 	consumed_bytes += bytes;
 }
 
+static inline void *fill_and_use(int bytes)
+{
+	void *p = fill(bytes);
+	use(bytes);
+	return p;
+}
+
+static NORETURN void bad_object(unsigned long offset, const char *format,
+		       ...) __attribute__((format (printf, 2, 3)));
+
+static uintmax_t read_varint(void)
+{
+	unsigned char c = *(char*)fill_and_use(1);
+	uintmax_t val = c & 127;
+	while (c & 128) {
+		val += 1;
+		if (!val || MSB(val, 7))
+			bad_object(consumed_bytes,
+				   _("offset overflow in read_varint"));
+		c = *(char*)fill_and_use(1);
+		val = (val << 7) + (c & 127);
+	}
+	return val;
+}
+
 static const char *open_pack_file(const char *pack_name)
 {
 	if (from_stdin) {
@@ -317,9 +342,6 @@ static void parse_pack_header(void)
 	use(sizeof(struct pack_header));
 }
 
-static NORETURN void bad_object(unsigned long offset, const char *format,
-		       ...) __attribute__((format (printf, 2, 3)));
-
 static NORETURN void bad_object(unsigned long offset, const char *format, ...)
 {
 	va_list params;
@@ -462,55 +484,41 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 	return buf == fixed_buf ? NULL : buf;
 }
 
+static void read_typesize_v2(struct object_entry *obj)
+{
+	unsigned char c = *(char*)fill_and_use(1);
+	unsigned shift;
+
+	obj->type = (c >> 4) & 7;
+	obj->size = (c & 15);
+	shift = 4;
+	while (c & 128) {
+		c = *(char*)fill_and_use(1);
+		obj->size += (c & 0x7f) << shift;
+		shift += 7;
+	}
+}
+
 static void *unpack_raw_entry(struct object_entry *obj,
 			      union delta_base *delta_base,
 			      unsigned char *sha1)
 {
-	unsigned char *p;
-	unsigned long size, c;
-	off_t base_offset;
-	unsigned shift;
 	void *data;
+	uintmax_t val;
 
 	obj->idx.offset = consumed_bytes;
 	input_crc32 = crc32(0, NULL, 0);
 
-	p = fill(1);
-	c = *p;
-	use(1);
-	obj->type = (c >> 4) & 7;
-	size = (c & 15);
-	shift = 4;
-	while (c & 0x80) {
-		p = fill(1);
-		c = *p;
-		use(1);
-		size += (c & 0x7f) << shift;
-		shift += 7;
-	}
-	obj->size = size;
+	read_typesize_v2(obj);
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
-		hashcpy(delta_base->sha1, fill(20));
-		use(20);
+		hashcpy(delta_base->sha1, fill_and_use(20));
 		break;
 	case OBJ_OFS_DELTA:
 		memset(delta_base, 0, sizeof(*delta_base));
-		p = fill(1);
-		c = *p;
-		use(1);
-		base_offset = c & 127;
-		while (c & 128) {
-			base_offset += 1;
-			if (!base_offset || MSB(base_offset, 7))
-				bad_object(obj->idx.offset, _("offset value overflow for delta base object"));
-			p = fill(1);
-			c = *p;
-			use(1);
-			base_offset = (base_offset << 7) + (c & 127);
-		}
-		delta_base->offset = obj->idx.offset - base_offset;
+		val = read_varint();
+		delta_base->offset = obj->idx.offset - val;
 		if (delta_base->offset <= 0 || delta_base->offset >= obj->idx.offset)
 			bad_object(obj->idx.offset, _("delta base offset is out of bound"));
 		break;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 05/14] index-pack: do not allocate buffer for unpacking deltas in the first pass
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (3 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 04/14] index-pack: split out varint decoding code Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 06/14] index-pack: split inflate/digest code out of unpack_entry_data Nguyễn Thái Ngọc Duy
                       ` (8 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

We do not need deltas until the second pass. Allocating a buffer for them
and then freeing it later is wasteful and unnecessary. Make it use fixed_buf
(aka large blob code path).
---
 builtin/index-pack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 5fbd517..78554d0 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -453,7 +453,8 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
-	if (type == OBJ_BLOB && size > big_file_threshold)
+	if (is_delta_type(type) ||
+	     (type == OBJ_BLOB && size > big_file_threshold))
 		buf = fixed_buf;
 	else
 		buf = xmalloc(size);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 06/14] index-pack: split inflate/digest code out of unpack_entry_data
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (4 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 05/14] index-pack: do not allocate buffer for unpacking deltas in the first pass Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 07/14] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
                       ` (7 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 62 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 78554d0..3389262 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -431,6 +431,40 @@ static int is_delta_type(enum object_type type)
 	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
 }
 
+static void read_and_inflate(unsigned long offset,
+			     void *buf, unsigned long size,
+			     unsigned long wraparound,
+			     git_SHA_CTX *ctx,
+			     unsigned char *sha1)
+{
+	git_zstream stream;
+	int status;
+
+	memset(&stream, 0, sizeof(stream));
+	git_inflate_init(&stream);
+	stream.next_out = buf;
+	stream.avail_out = wraparound ? wraparound : size;
+
+	do {
+		unsigned char *last_out = stream.next_out;
+		stream.next_in = fill(1);
+		stream.avail_in = input_len;
+		status = git_inflate(&stream, 0);
+		use(input_len - stream.avail_in);
+		if (sha1)
+			git_SHA1_Update(ctx, last_out, stream.next_out - last_out);
+		if (wraparound) {
+			stream.next_out = buf;
+			stream.avail_out = wraparound;
+		}
+	} while (status == Z_OK);
+	if (stream.total_out != size || status != Z_STREAM_END)
+		bad_object(offset, _("inflate returned %d"), status);
+	git_inflate_end(&stream);
+	if (sha1)
+		git_SHA1_Final(sha1, ctx);
+}
+
 /*
  * Unpack an entry data in the streamed pack, calculate the object
  * SHA-1 if it's not a large blob. Otherwise just try to inflate the
@@ -440,8 +474,6 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
 	static char fixed_buf[8192];
-	int status;
-	git_zstream stream;
 	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
@@ -459,29 +491,9 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 	else
 		buf = xmalloc(size);
 
-	memset(&stream, 0, sizeof(stream));
-	git_inflate_init(&stream);
-	stream.next_out = buf;
-	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
-
-	do {
-		unsigned char *last_out = stream.next_out;
-		stream.next_in = fill(1);
-		stream.avail_in = input_len;
-		status = git_inflate(&stream, 0);
-		use(input_len - stream.avail_in);
-		if (sha1)
-			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
-		if (buf == fixed_buf) {
-			stream.next_out = buf;
-			stream.avail_out = sizeof(fixed_buf);
-		}
-	} while (status == Z_OK);
-	if (stream.total_out != size || status != Z_STREAM_END)
-		bad_object(offset, _("inflate returned %d"), status);
-	git_inflate_end(&stream);
-	if (sha1)
-		git_SHA1_Final(sha1, &c);
+	read_and_inflate(offset, buf, size,
+			 buf == fixed_buf ? sizeof(fixed_buf) : 0,
+			 &c, sha1);
 	return buf == fixed_buf ? NULL : buf;
 }
 
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 07/14] index-pack: parse v4 header and dictionaries
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (5 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 06/14] index-pack: split inflate/digest code out of unpack_entry_data Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 08/14] index-pack: make sure all objects are registered in v4's SHA-1 table Nguyễn Thái Ngọc Duy
                       ` (6 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 3389262..83e6e79 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -11,6 +11,7 @@
 #include "exec_cmd.h"
 #include "streaming.h"
 #include "thread-utils.h"
+#include "packv4-parse.h"
 
 static const char index_pack_usage[] =
 "git index-pack [-v] [-o <index-file>] [--keep | --keep=<msg>] [--verify] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
@@ -70,6 +71,8 @@ struct delta_entry {
 static struct object_entry *objects;
 static struct delta_entry *deltas;
 static struct thread_local nothread_data;
+static unsigned char *sha1_table;
+static struct packv4_dict *name_dict, *path_dict;
 static int nr_objects;
 static int nr_deltas;
 static int nr_resolved_deltas;
@@ -81,6 +84,7 @@ static int do_fsck_object;
 static int verbose;
 static int show_stat;
 static int check_self_contained_and_connected;
+static int packv4;
 
 static struct progress *progress;
 
@@ -334,7 +338,9 @@ static void parse_pack_header(void)
 	/* Header consistency check */
 	if (hdr->hdr_signature != htonl(PACK_SIGNATURE))
 		die(_("pack signature mismatch"));
-	if (!pack_version_ok(hdr->hdr_version))
+	if (hdr->hdr_version == htonl(4))
+		packv4 = 1;
+	else if (!pack_version_ok(hdr->hdr_version))
 		die(_("pack version %"PRIu32" unsupported"),
 			ntohl(hdr->hdr_version));
 
@@ -1035,6 +1041,40 @@ static void *threaded_second_pass(void *data)
 }
 #endif
 
+static struct packv4_dict *read_dict(void)
+{
+	unsigned long size;
+	unsigned char *data;
+	struct packv4_dict *dict;
+
+	size = read_varint();
+	data = xmallocz(size);
+	read_and_inflate(consumed_bytes, data, size, 0, NULL, NULL);
+	dict = pv4_create_dict(data, size);
+	if (!dict)
+		die("unable to parse dictionary");
+	return dict;
+}
+
+static void parse_dictionaries(void)
+{
+	int i;
+	if (!packv4)
+		return;
+
+	sha1_table = xmalloc(20 * nr_objects);
+	hashcpy(sha1_table, fill_and_use(20));
+	for (i = 1; i < nr_objects; i++) {
+		unsigned char *p = sha1_table + i * 20;
+		hashcpy(p, fill_and_use(20));
+		if (hashcmp(p - 20, p) >= 0)
+			die(_("wrong order in SHA-1 table at entry %d"), i);
+	}
+
+	name_dict = read_dict();
+	path_dict = read_dict();
+}
+
 /*
  * First pass:
  * - find locations of all objects;
@@ -1673,6 +1713,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	parse_pack_header();
 	objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
 	deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
+	parse_dictionaries();
 	parse_pack_objects(pack_sha1);
 	resolve_deltas();
 	conclude_pack(fix_thin_pack, curr_pack, pack_sha1);
@@ -1683,6 +1724,9 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	if (show_stat)
 		show_pack_info(stat_only);
 
+	if (packv4)
+		die("we're not there yet");
+
 	idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
 	for (i = 0; i < nr_objects; i++)
 		idx_objects[i] = &objects[i].idx;
@@ -1699,6 +1743,9 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	free(objects);
 	free(index_name_buf);
 	free(keep_name_buf);
+	free(sha1_table);
+	pv4_free_dict(name_dict);
+	pv4_free_dict(path_dict);
 	if (pack_name == NULL)
 		free((void *) curr_pack);
 	if (index_name == NULL)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 08/14] index-pack: make sure all objects are registered in v4's SHA-1 table
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (6 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 07/14] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 09/14] index-pack: parse v4 commit format Nguyễn Thái Ngọc Duy
                       ` (5 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 83e6e79..efb969a 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -288,6 +288,19 @@ static inline void *fill_and_use(int bytes)
 	return p;
 }
 
+static void check_against_sha1table(const unsigned char *sha1)
+{
+	const unsigned char *found;
+	if (!packv4)
+		return;
+
+	found = bsearch(sha1, sha1_table, nr_objects, 20,
+			(int (*)(const void *, const void *))hashcmp);
+	if (!found)
+		die(_("object %s not found in SHA-1 table"),
+		    sha1_to_hex(sha1));
+}
+
 static NORETURN void bad_object(unsigned long offset, const char *format,
 		       ...) __attribute__((format (printf, 2, 3)));
 
@@ -907,6 +920,7 @@ static void resolve_delta(struct object_entry *delta_obj,
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
+	check_against_sha1table(delta_obj->idx.sha1);
 	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
@@ -1103,8 +1117,12 @@ static void parse_pack_objects(unsigned char *sha1)
 			/* large blobs, check later */
 			obj->real_type = OBJ_BAD;
 			nr_delays++;
-		} else
-			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
+			check_against_sha1table(obj->idx.sha1);
+		} else {
+			check_against_sha1table(obj->idx.sha1);
+			sha1_object(data, NULL, obj->size, obj->type,
+				    obj->idx.sha1);
+		}
 		free(data);
 		display_progress(progress, i+1);
 	}
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 09/14] index-pack: parse v4 commit format
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (7 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 08/14] index-pack: make sure all objects are registered in v4's SHA-1 table Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 10/14] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
                       ` (4 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 91 insertions(+), 3 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index efb969a..473514a 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -319,6 +319,30 @@ static uintmax_t read_varint(void)
 	return val;
 }
 
+static const unsigned char *read_sha1ref(void)
+{
+	unsigned int index = read_varint();
+	if (!index) {
+		static unsigned char sha1[20];
+		hashcpy(sha1, fill_and_use(20));
+		return sha1;
+	}
+	index--;
+	if (index >= nr_objects)
+		bad_object(consumed_bytes,
+			   _("bad index in read_sha1ref"));
+	return sha1_table + index * 20;
+}
+
+static const unsigned char *read_dictref(struct packv4_dict *dict)
+{
+	unsigned int index = read_varint();
+	if (index >= dict->nb_entries)
+		bad_object(consumed_bytes,
+			   _("bad index in read_dictref"));
+	return  dict->data + dict->offsets[index];
+}
+
 static const char *open_pack_file(const char *pack_name)
 {
 	if (from_stdin) {
@@ -484,6 +508,58 @@ static void read_and_inflate(unsigned long offset,
 		git_SHA1_Final(sha1, ctx);
 }
 
+static void *unpack_commit_v4(unsigned int offset, unsigned long size,
+			      unsigned char *sha1)
+{
+	unsigned int nb_parents;
+	const unsigned char *committer, *author, *ident;
+	unsigned long author_time, committer_time;
+	git_SHA_CTX ctx;
+	char hdr[32];
+	int hdrlen;
+	int16_t committer_tz, author_tz;
+	struct strbuf dst;
+
+	strbuf_init(&dst, size);
+
+	strbuf_addf(&dst, "tree %s\n", sha1_to_hex(read_sha1ref()));
+	nb_parents = read_varint();
+	while (nb_parents--)
+		strbuf_addf(&dst, "parent %s\n", sha1_to_hex(read_sha1ref()));
+
+	committer_time = read_varint();
+	ident = read_dictref(name_dict);
+	committer_tz = (ident[0] << 8) | ident[1];
+	committer = ident + 2;
+
+	author_time = read_varint();
+	ident = read_dictref(name_dict);
+	author_tz = (ident[0] << 8) | ident[1];
+	author = ident + 2;
+
+	if (author_time & 1)
+		author_time = committer_time + (author_time >> 1);
+	else
+		author_time = committer_time - (author_time >> 1);
+
+	strbuf_addf(&dst,
+		    "author %s %lu %+05d\n"
+		    "committer %s %lu %+05d\n",
+		    author, author_time, author_tz,
+		    committer, committer_time, committer_tz);
+
+	if (dst.len > size)
+		bad_object(offset, _("bad commit"));
+
+	hdrlen = sprintf(hdr, "commit %lu", size) + 1;
+	git_SHA1_Init(&ctx);
+	git_SHA1_Update(&ctx, hdr, hdrlen);
+	git_SHA1_Update(&ctx, dst.buf, dst.len);
+	read_and_inflate(offset, dst.buf + dst.len, size - dst.len,
+			 0, &ctx, sha1);
+	return dst.buf;
+}
+
 /*
  * Unpack an entry data in the streamed pack, calculate the object
  * SHA-1 if it's not a large blob. Otherwise just try to inflate the
@@ -498,6 +574,9 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 	char hdr[32];
 	int hdrlen;
 
+	if (type == OBJ_PV4_COMMIT)
+		return unpack_commit_v4(offset, size, sha1);
+
 	if (!is_delta_type(type)) {
 		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
 		git_SHA1_Init(&c);
@@ -541,7 +620,13 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	obj->idx.offset = consumed_bytes;
 	input_crc32 = crc32(0, NULL, 0);
 
-	read_typesize_v2(obj);
+	if (packv4) {
+		val = read_varint();
+		obj->type = val & 15;
+		obj->size = val >> 4;
+	} else
+		read_typesize_v2(obj);
+	obj->real_type = obj->type;
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
@@ -559,6 +644,10 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	case OBJ_BLOB:
 	case OBJ_TAG:
 		break;
+
+	case OBJ_PV4_COMMIT:
+		obj->real_type = OBJ_COMMIT;
+		break;
 	default:
 		bad_object(obj->idx.offset, _("unknown object type %d"), obj->type);
 	}
@@ -1108,7 +1197,6 @@ static void parse_pack_objects(unsigned char *sha1)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
 		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
-		obj->real_type = obj->type;
 		if (is_delta_type(obj->type)) {
 			nr_deltas++;
 			delta->obj_no = i;
@@ -1120,7 +1208,7 @@ static void parse_pack_objects(unsigned char *sha1)
 			check_against_sha1table(obj->idx.sha1);
 		} else {
 			check_against_sha1table(obj->idx.sha1);
-			sha1_object(data, NULL, obj->size, obj->type,
+			sha1_object(data, NULL, obj->size, obj->real_type,
 				    obj->idx.sha1);
 		}
 		free(data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 10/14] index-pack: parse v4 tree format
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (8 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 09/14] index-pack: parse v4 commit format Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 11/14] index-pack: move delta base queuing code to unpack_raw_entry Nguyễn Thái Ngọc Duy
                       ` (3 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 4 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 473514a..dcb6409 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -334,6 +334,14 @@ static const unsigned char *read_sha1ref(void)
 	return sha1_table + index * 20;
 }
 
+static const unsigned char *read_sha1table_ref(void)
+{
+	const unsigned char *sha1 = read_sha1ref();
+	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects * 20)
+		check_against_sha1table(sha1);
+	return sha1;
+}
+
 static const unsigned char *read_dictref(struct packv4_dict *dict)
 {
 	unsigned int index = read_varint();
@@ -561,21 +569,105 @@ static void *unpack_commit_v4(unsigned int offset, unsigned long size,
 }
 
 /*
+ * v4 trees are actually kind of deltas and we don't do delta in the
+ * first pass. This function only walks through a tree object to find
+ * the end offset, register object dependencies and performs limited
+ * validation. For v4 trees that have no dependencies, we do
+ * uncompress and calculate their SHA-1.
+ */
+static void *unpack_tree_v4(struct object_entry *obj,
+			    unsigned int offset, unsigned long size,
+			    unsigned char *sha1)
+{
+	unsigned int nr = read_varint();
+	const unsigned char *last_base = NULL;
+	struct strbuf sb = STRBUF_INIT;
+	while (nr) {
+		unsigned int copy_start_or_path = read_varint();
+		if (copy_start_or_path & 1) { /* copy_start */
+			unsigned int copy_count = read_varint();
+			if (copy_count & 1) { /* first delta */
+				last_base = read_sha1table_ref();
+			} else if (!last_base)
+				bad_object(offset,
+					   _("missing delta base unpack_tree_v4"));
+			copy_count >>= 1;
+			if (!copy_count || copy_count > nr)
+				bad_object(offset,
+					   _("bad copy count index in unpack_tree_v4"));
+			nr -= copy_count;
+		} else {	/* path */
+			unsigned int path_idx = copy_start_or_path >> 1;
+			const unsigned char *entry_sha1;
+
+			if (path_idx >= path_dict->nb_entries)
+				bad_object(offset,
+					   _("bad path index in unpack_tree_v4"));
+			entry_sha1 = read_sha1ref();
+			nr--;
+
+			/*
+			 * Attempt to rebuild a canonical (base) tree.
+			 * If last_base is set, this tree depends on
+			 * another tree, which we have no access at this
+			 * stage, so reconstruction must be delayed until
+			 * the second pass.
+			 */
+			if (!last_base) {
+				const unsigned char *path;
+				unsigned mode;
+
+				path = path_dict->data + path_dict->offsets[path_idx];
+				mode = (path[0] << 8) | path[1];
+				strbuf_addf(&sb, "%o %s%c", mode, path+2, '\0');
+				strbuf_add(&sb, entry_sha1, 20);
+				if (sb.len > size)
+					bad_object(offset,
+						   _("tree larger than expected"));
+			}
+		}
+	}
+
+	if (last_base) {
+		strbuf_release(&sb);
+		return NULL;
+	} else {
+		git_SHA_CTX ctx;
+		char hdr[32];
+		int hdrlen;
+
+		if (sb.len != size)
+			bad_object(offset, _("tree size mismatch"));
+
+		hdrlen = sprintf(hdr, "tree %lu", size) + 1;
+		git_SHA1_Init(&ctx);
+		git_SHA1_Update(&ctx, hdr, hdrlen);
+		git_SHA1_Update(&ctx, sb.buf, size);
+		git_SHA1_Final(sha1, &ctx);
+		return strbuf_detach(&sb, NULL);
+	}
+}
+
+/*
  * Unpack an entry data in the streamed pack, calculate the object
  * SHA-1 if it's not a large blob. Otherwise just try to inflate the
  * object to /dev/null to determine the end of the entry in the pack.
  */
-static void *unpack_entry_data(unsigned long offset, unsigned long size,
-			       enum object_type type, unsigned char *sha1)
+static void *unpack_entry_data(struct object_entry *obj, unsigned char *sha1)
 {
 	static char fixed_buf[8192];
 	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
 	int hdrlen;
+	unsigned long offset = obj->idx.offset;
+	unsigned long size = obj->size;
+	enum object_type type = obj->type;
 
 	if (type == OBJ_PV4_COMMIT)
 		return unpack_commit_v4(offset, size, sha1);
+	if (type == OBJ_PV4_TREE)
+		return unpack_tree_v4(obj, offset, size, sha1);
 
 	if (!is_delta_type(type)) {
 		hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1;
@@ -644,16 +736,19 @@ static void *unpack_raw_entry(struct object_entry *obj,
 	case OBJ_BLOB:
 	case OBJ_TAG:
 		break;
-
 	case OBJ_PV4_COMMIT:
 		obj->real_type = OBJ_COMMIT;
 		break;
+	case OBJ_PV4_TREE:
+		obj->real_type = OBJ_TREE;
+		break;
+
 	default:
 		bad_object(obj->idx.offset, _("unknown object type %d"), obj->type);
 	}
 	obj->hdr_size = consumed_bytes - obj->idx.offset;
 
-	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
+	data = unpack_entry_data(obj, sha1);
 	obj->idx.crc32 = input_crc32;
 	return data;
 }
@@ -1201,6 +1296,8 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
+		} else if (!data && obj->type == OBJ_PV4_TREE) {
+			/* delay sha1_object() until second pass */
 		} else if (!data) {
 			/* large blobs, check later */
 			obj->real_type = OBJ_BAD;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 11/14] index-pack: move delta base queuing code to unpack_raw_entry
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (9 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 10/14] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 12/14] index-pack: record all delta bases in v4 (tree and ref-delta) Nguyễn Thái Ngọc Duy
                       ` (2 subsequent siblings)
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

For v2, ofs-delta and ref-delta objects can only queue one delta base
at a time. A v4 tree can have more than one delta base. Move the queuing
code up to unpack_raw_entry() and give unpack_tree_v4() more
flexibility to add its bases.
---
 builtin/index-pack.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dcb6409..8f2d929 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -568,6 +568,25 @@ static void *unpack_commit_v4(unsigned int offset, unsigned long size,
 	return dst.buf;
 }
 
+static void add_sha1_delta(struct object_entry *obj,
+			   const unsigned char *sha1)
+{
+	struct delta_entry *delta = deltas + nr_deltas;
+	delta->obj_no = obj - objects;
+	hashcpy(delta->base.sha1, sha1);
+	nr_deltas++;
+}
+
+static void add_ofs_delta(struct object_entry *obj,
+			  off_t offset)
+{
+	struct delta_entry *delta = deltas + nr_deltas;
+	delta->obj_no = obj - objects;
+	memset(&delta->base, 0, sizeof(delta->base));
+	delta->base.offset = offset;
+	nr_deltas++;
+}
+
 /*
  * v4 trees are actually kind of deltas and we don't do delta in the
  * first pass. This function only walks through a tree object to find
@@ -703,17 +722,16 @@ static void read_typesize_v2(struct object_entry *obj)
 }
 
 static void *unpack_raw_entry(struct object_entry *obj,
-			      union delta_base *delta_base,
 			      unsigned char *sha1)
 {
 	void *data;
-	uintmax_t val;
+	off_t offset;
 
 	obj->idx.offset = consumed_bytes;
 	input_crc32 = crc32(0, NULL, 0);
 
 	if (packv4) {
-		val = read_varint();
+		uintmax_t val = read_varint();
 		obj->type = val & 15;
 		obj->size = val >> 4;
 	} else
@@ -722,14 +740,14 @@ static void *unpack_raw_entry(struct object_entry *obj,
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
-		hashcpy(delta_base->sha1, fill_and_use(20));
+		add_sha1_delta(obj, fill_and_use(20));
 		break;
 	case OBJ_OFS_DELTA:
-		memset(delta_base, 0, sizeof(*delta_base));
-		val = read_varint();
-		delta_base->offset = obj->idx.offset - val;
-		if (delta_base->offset <= 0 || delta_base->offset >= obj->idx.offset)
-			bad_object(obj->idx.offset, _("delta base offset is out of bound"));
+		offset = obj->idx.offset - read_varint();
+		if (offset <= 0 || offset >= obj->idx.offset)
+			bad_object(obj->idx.offset,
+				   _("delta base offset is out of bound"));
+		add_ofs_delta(obj, offset);
 		break;
 	case OBJ_COMMIT:
 	case OBJ_TREE:
@@ -1282,7 +1300,6 @@ static void parse_dictionaries(void)
 static void parse_pack_objects(unsigned char *sha1)
 {
 	int i, nr_delays = 0;
-	struct delta_entry *delta = deltas;
 	struct stat st;
 
 	if (verbose)
@@ -1291,12 +1308,9 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
-		if (is_delta_type(obj->type)) {
-			nr_deltas++;
-			delta->obj_no = i;
-			delta++;
-		} else if (!data && obj->type == OBJ_PV4_TREE) {
+		void *data = unpack_raw_entry(obj, obj->idx.sha1);
+		if (is_delta_type(obj->type) ||
+		    (!data && obj->type == OBJ_PV4_TREE)) {
 			/* delay sha1_object() until second pass */
 		} else if (!data) {
 			/* large blobs, check later */
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 12/14] index-pack: record all delta bases in v4 (tree and ref-delta)
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (10 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 11/14] index-pack: move delta base queuing code to unpack_raw_entry Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 13/14] index-pack: skip looking for ofs-deltas in v4 as they are not allowed Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 14/14] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 8f2d929..e903a49 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -24,6 +24,7 @@ struct object_entry {
 	enum object_type real_type; /* type after delta resolving */
 	unsigned delta_depth;
 	int base_object_no;
+	int nr_bases;		/* only valid for v4 trees */
 };
 
 union delta_base {
@@ -482,6 +483,11 @@ static int is_delta_type(enum object_type type)
 	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
 }
 
+static int is_delta_tree(const struct object_entry *obj)
+{
+	return obj->type == OBJ_PV4_TREE && obj->nr_bases > 0;
+}
+
 static void read_and_inflate(unsigned long offset,
 			     void *buf, unsigned long size,
 			     unsigned long wraparound,
@@ -587,6 +593,20 @@ static void add_ofs_delta(struct object_entry *obj,
 	nr_deltas++;
 }
 
+static void add_tree_delta_base(struct object_entry *obj,
+				const unsigned char *base,
+				int delta_start)
+{
+	int i;
+
+	for (i = delta_start; i < nr_deltas; i++)
+		if (!hashcmp(base, deltas[i].base.sha1))
+			return;
+
+	add_sha1_delta(obj, base);
+	obj->nr_bases++;
+}
+
 /*
  * v4 trees are actually kind of deltas and we don't do delta in the
  * first pass. This function only walks through a tree object to find
@@ -601,12 +621,14 @@ static void *unpack_tree_v4(struct object_entry *obj,
 	unsigned int nr = read_varint();
 	const unsigned char *last_base = NULL;
 	struct strbuf sb = STRBUF_INIT;
+	int delta_start = nr_deltas;
 	while (nr) {
 		unsigned int copy_start_or_path = read_varint();
 		if (copy_start_or_path & 1) { /* copy_start */
 			unsigned int copy_count = read_varint();
 			if (copy_count & 1) { /* first delta */
 				last_base = read_sha1table_ref();
+				add_tree_delta_base(obj, last_base, delta_start);
 			} else if (!last_base)
 				bad_object(offset,
 					   _("missing delta base unpack_tree_v4"));
@@ -740,9 +762,15 @@ static void *unpack_raw_entry(struct object_entry *obj,
 
 	switch (obj->type) {
 	case OBJ_REF_DELTA:
-		add_sha1_delta(obj, fill_and_use(20));
+		if (packv4)
+			add_sha1_delta(obj, read_sha1table_ref());
+		else
+			add_sha1_delta(obj, fill_and_use(20));
 		break;
 	case OBJ_OFS_DELTA:
+		if (packv4)
+			die(_("pack version 4 does not support ofs-delta type (offset %lu)"),
+			    obj->idx.offset);
 		offset = obj->idx.offset - read_varint();
 		if (offset <= 0 || offset >= obj->idx.offset)
 			bad_object(obj->idx.offset,
@@ -1309,8 +1337,7 @@ static void parse_pack_objects(unsigned char *sha1)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
 		void *data = unpack_raw_entry(obj, obj->idx.sha1);
-		if (is_delta_type(obj->type) ||
-		    (!data && obj->type == OBJ_PV4_TREE)) {
+		if (is_delta_type(obj->type) || is_delta_tree(obj)) {
 			/* delay sha1_object() until second pass */
 		} else if (!data) {
 			/* large blobs, check later */
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 13/14] index-pack: skip looking for ofs-deltas in v4 as they are not allowed
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (11 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 12/14] index-pack: record all delta bases in v4 (tree and ref-delta) Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  2013-09-08  7:22     ` [PATCH v2 14/14] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

---
 builtin/index-pack.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index e903a49..ce06473 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1173,10 +1173,13 @@ static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 		find_delta_children(&base_spec,
 				    &base->ref_first, &base->ref_last, OBJ_REF_DELTA);
 
-		memset(&base_spec, 0, sizeof(base_spec));
-		base_spec.offset = base->obj->idx.offset;
-		find_delta_children(&base_spec,
-				    &base->ofs_first, &base->ofs_last, OBJ_OFS_DELTA);
+		if (!packv4) {
+			memset(&base_spec, 0, sizeof(base_spec));
+			base_spec.offset = base->obj->idx.offset;
+			find_delta_children(&base_spec,
+					    &base->ofs_first, &base->ofs_last,
+					    OBJ_OFS_DELTA);
+		}
 
 		if (base->ref_last == -1 && base->ofs_last == -1) {
 			free(base->data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 14/14] index-pack: resolve v4 one-base trees
  2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
                       ` (12 preceding siblings ...)
  2013-09-08  7:22     ` [PATCH v2 13/14] index-pack: skip looking for ofs-deltas in v4 as they are not allowed Nguyễn Thái Ngọc Duy
@ 2013-09-08  7:22     ` Nguyễn Thái Ngọc Duy
  13 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08  7:22 UTC (permalink / raw)
  To: git; +Cc: Nguyễn Thái Ngọc Duy

This is the most common case for delta trees. In fact it's the only
kind that's produced by packv4-create. It fits well with the way
index-pack resolves deltas and benefits from threading (the set of
objects depending on this base does not overlap with the set of
objects depending on another base).

Multi-base trees will probably be processed differently.
---
 builtin/index-pack.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 179 insertions(+), 16 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index ce06473..88340b5 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -12,6 +12,8 @@
 #include "streaming.h"
 #include "thread-utils.h"
 #include "packv4-parse.h"
+#include "varint.h"
+#include "tree-walk.h"
 
 static const char index_pack_usage[] =
 "git index-pack [-v] [-o <index-file>] [--keep | --keep=<msg>] [--verify] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
@@ -38,8 +40,8 @@ struct base_data {
 	struct object_entry *obj;
 	void *data;
 	unsigned long size;
-	int ref_first, ref_last;
-	int ofs_first, ofs_last;
+	int ref_first, ref_last, tree_first;
+	int ofs_first, ofs_last, tree_last;
 };
 
 #if !defined(NO_PTHREADS) && defined(NO_THREAD_SAFE_PREAD)
@@ -430,6 +432,7 @@ static struct base_data *alloc_base_data(void)
 	memset(base, 0, sizeof(*base));
 	base->ref_last = -1;
 	base->ofs_last = -1;
+	base->tree_last = -1;
 	return base;
 }
 
@@ -670,6 +673,8 @@ static void *unpack_tree_v4(struct object_entry *obj,
 	}
 
 	if (last_base) {
+		if (nr_deltas - delta_start > 1)
+			die("sorry guys, multi-base trees are not supported yet");
 		strbuf_release(&sb);
 		return NULL;
 	} else {
@@ -800,6 +805,84 @@ static void *unpack_raw_entry(struct object_entry *obj,
 }
 
 /*
+ * Some checks are skipped because they are already done by
+ * unpack_tree_v4() in the first pass.
+ */
+static void *patch_one_base_tree(const struct object_entry *src,
+				 const unsigned char *src_buf,
+				 const unsigned char *delta_buf,
+				 unsigned long delta_size,
+				 unsigned long *dst_size)
+{
+	int nr;
+	const unsigned char *last_base = NULL;
+	struct strbuf sb = STRBUF_INIT;
+	const unsigned char *p = delta_buf;
+
+	nr = decode_varint(&p);
+	while (nr > 0 && p < delta_buf + delta_size) {
+		unsigned int copy_start_or_path = decode_varint(&p);
+		if (copy_start_or_path & 1) { /* copy_start */
+			struct tree_desc desc;
+			struct name_entry entry;
+			unsigned int copy_count = decode_varint(&p);
+			unsigned int copy_start = copy_start_or_path >> 1;
+			if (!src)
+				die("we are not supposed to copy from another tree!");
+			if (copy_count & 1) { /* first delta */
+				unsigned int id = decode_varint(&p);
+				if (!id) {
+					last_base = p;
+					p += 20;
+				} else
+					last_base = sha1_table + (id - 1) * 20;
+				if (hashcmp(last_base, src->idx.sha1))
+					die(_("bad tree base in patch_one_base_tree"));
+			}
+
+			copy_count >>= 1;
+			nr -= copy_count;
+
+			init_tree_desc(&desc, src_buf, src->size);
+			while (tree_entry(&desc, &entry)) {
+				if (copy_start)
+					copy_start--;
+				else if (copy_count) {
+					strbuf_addf(&sb, "%o %s%c",
+						    entry.mode, entry.path, '\0');
+					strbuf_add(&sb, entry.sha1, 20);
+					copy_count--;
+				} else
+					break;
+			}
+		} else {	/* path */
+			unsigned int path_idx = copy_start_or_path >> 1;
+			const unsigned char *path;
+			unsigned mode;
+			unsigned int id;
+			const unsigned char *entry_sha1;
+
+			id = decode_varint(&p);
+			if (!id) {
+				entry_sha1 = p;
+				p += 20;
+			} else
+				entry_sha1 = sha1_table + (id - 1) * 20;
+			nr--;
+
+			path = path_dict->data + path_dict->offsets[path_idx];
+			mode = (path[0] << 8) | path[1];
+			strbuf_addf(&sb, "%o %s%c", mode, path+2, '\0');
+			strbuf_add(&sb, entry_sha1, 20);
+		}
+	}
+	if (nr != 0 || p != delta_buf + delta_size)
+		die(_("bad delta tree"));
+	*dst_size = sb.len;
+	return sb.buf;
+}
+
+/*
  * Unpack entry data in the second pass when the pack is already
  * stored on disk. consume call back is used for large-blob case. Must
  * be thread safe.
@@ -865,8 +948,33 @@ static void *unpack_data(struct object_entry *obj,
 	return data;
 }
 
+static void *get_tree_v4_from_pack(struct object_entry *obj,
+				   unsigned long *len_p)
+{
+	off_t from = obj[0].idx.offset + obj[0].hdr_size;
+	unsigned long len = obj[1].idx.offset - from;
+	unsigned char *data;
+	ssize_t n;
+
+	data = xmalloc(len);
+	n = pread(pack_fd, data, len, from);
+	if (n < 0)
+		die_errno(_("cannot pread pack file"));
+	if (!n)
+		die(Q_("premature end of pack file, %lu byte missing",
+		       "premature end of pack file, %lu bytes missing",
+		       len),
+		    len);
+	if (len_p)
+		*len_p = len;
+	return data;
+}
+
 static void *get_data_from_pack(struct object_entry *obj)
 {
+	if (obj->type == OBJ_PV4_COMMIT || obj->type == OBJ_PV4_TREE)
+		die("BUG: unsupported code path");
+
 	return unpack_data(obj, NULL, NULL);
 }
 
@@ -1093,14 +1201,25 @@ static void *get_base_data(struct base_data *c)
 		struct object_entry *obj = c->obj;
 		struct base_data **delta = NULL;
 		int delta_nr = 0, delta_alloc = 0;
+		unsigned long size, len;
 
-		while (is_delta_type(c->obj->type) && !c->data) {
+		while ((is_delta_type(c->obj->type) ||
+			(c->base && c->obj->type == OBJ_PV4_TREE)) &&
+		       !c->data) {
 			ALLOC_GROW(delta, delta_nr + 1, delta_alloc);
 			delta[delta_nr++] = c;
 			c = c->base;
 		}
 		if (!delta_nr) {
-			c->data = get_data_from_pack(obj);
+			if (c->obj->type == OBJ_PV4_TREE) {
+				void *tree_v4 = get_tree_v4_from_pack(obj, &len);
+				c->data = patch_one_base_tree(NULL, NULL,
+							      tree_v4, len, &size);
+				if (size != obj->size)
+					die("size mismatch");
+				free(tree_v4);
+			} else
+				c->data = get_data_from_pack(obj);
 			c->size = obj->size;
 			get_thread_data()->base_cache_used += c->size;
 			prune_base_data(c);
@@ -1110,11 +1229,18 @@ static void *get_base_data(struct base_data *c)
 			c = delta[delta_nr - 1];
 			obj = c->obj;
 			base = get_base_data(c->base);
-			raw = get_data_from_pack(obj);
-			c->data = patch_delta(
-				base, c->base->size,
-				raw, obj->size,
-				&c->size);
+			if (c->obj->type == OBJ_PV4_TREE) {
+				raw = get_tree_v4_from_pack(obj, &len);
+				c->data = patch_one_base_tree(c->base->obj, base,
+							      raw, len, &size);
+				if (size != obj->size)
+					die("size mismatch");
+			} else {
+				raw = get_data_from_pack(obj);
+				c->data = patch_delta(base, c->base->size,
+						      raw, obj->size,
+						      &c->size);
+			}
 			free(raw);
 			if (!c->data)
 				bad_object(obj->idx.offset, _("failed to apply delta"));
@@ -1130,6 +1256,8 @@ static void resolve_delta(struct object_entry *delta_obj,
 			  struct base_data *base, struct base_data *result)
 {
 	void *base_data, *delta_data;
+	int tree_v4 = delta_obj->type == OBJ_PV4_TREE;
+	unsigned long tree_size;
 
 	delta_obj->real_type = base->obj->real_type;
 	if (show_stat) {
@@ -1140,10 +1268,18 @@ static void resolve_delta(struct object_entry *delta_obj,
 		deepest_delta_unlock();
 	}
 	delta_obj->base_object_no = base->obj - objects;
-	delta_data = get_data_from_pack(delta_obj);
+	if (tree_v4)
+		delta_data = get_tree_v4_from_pack(delta_obj, &tree_size);
+	else
+		delta_data = get_data_from_pack(delta_obj);
 	base_data = get_base_data(base);
 	result->obj = delta_obj;
-	result->data = patch_delta(base_data, base->size,
+	if (tree_v4)
+		result->data = patch_one_base_tree(base->obj, base_data,
+						   delta_data, tree_size,
+						   &result->size);
+	else
+		result->data = patch_delta(base_data, base->size,
 				   delta_data, delta_obj->size, &result->size);
 	free(delta_data);
 	if (!result->data)
@@ -1166,7 +1302,8 @@ static void resolve_delta(struct object_entry *delta_obj,
 static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 						  struct base_data *prev_base)
 {
-	if (base->ref_last == -1 && base->ofs_last == -1) {
+	if (base->ref_last == -1 && base->ofs_last == -1 &&
+	    base->tree_last == -1) {
 		union delta_base base_spec;
 
 		hashcpy(base_spec.sha1, base->obj->idx.sha1);
@@ -1179,9 +1316,15 @@ static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 			find_delta_children(&base_spec,
 					    &base->ofs_first, &base->ofs_last,
 					    OBJ_OFS_DELTA);
+		} else {
+			hashcpy(base_spec.sha1, base->obj->idx.sha1);
+			find_delta_children(&base_spec,
+					    &base->tree_first, &base->tree_last,
+					    OBJ_PV4_TREE);
 		}
 
-		if (base->ref_last == -1 && base->ofs_last == -1) {
+		if (base->ref_last == -1 && base->ofs_last == -1 &&
+		    base->tree_last == -1) {
 			free(base->data);
 			return NULL;
 		}
@@ -1215,6 +1358,25 @@ static struct base_data *find_unresolved_deltas_1(struct base_data *base,
 		return result;
 	}
 
+	while (base->tree_first <= base->tree_last) {
+		struct object_entry *child = objects + deltas[base->tree_first].obj_no;
+		struct base_data *result;
+
+		assert(child->type == OBJ_PV4_TREE);
+		if (child->nr_bases > 1) {
+			/* maybe resolved in the third pass or something */
+			base->tree_first++;
+			continue;
+		}
+		result = alloc_base_data();
+		resolve_delta(child, base, result);
+		if (base->tree_first == base->tree_last)
+			free_base_data(base);
+
+		base->tree_first++;
+		return result;
+	}
+
 	unlink_base_data(base);
 	return NULL;
 }
@@ -1273,7 +1435,8 @@ static void *threaded_second_pass(void *data)
 		counter_unlock();
 		work_lock();
 		while (nr_dispatched < nr_objects &&
-		       is_delta_type(objects[nr_dispatched].type))
+		       (is_delta_type(objects[nr_dispatched].type) ||
+			is_delta_tree(objects + nr_dispatched)))
 			nr_dispatched++;
 		if (nr_dispatched >= nr_objects) {
 			work_unlock();
@@ -1427,7 +1590,7 @@ static void resolve_deltas(void)
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
 
-		if (is_delta_type(obj->type))
+		if (is_delta_type(obj->type) || is_delta_tree(obj))
 			continue;
 		resolve_base(obj);
 		display_progress(progress, nr_resolved_deltas);
@@ -1972,7 +2135,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 		show_pack_info(stat_only);
 
 	if (packv4)
-		die("we're not there yet");
+		opts.version = 3;
 
 	idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
 	for (i = 0; i < nr_objects; i++)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 00/11] pack v4 support in pack-objects
  2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
                   ` (38 preceding siblings ...)
  2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04 ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 01/11] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
                     ` (11 more replies)
  39 siblings, 12 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

I can produce pack v4 on git.git with this and verify it with
index-pack. I'm not familiar with the pack-objects code and not really
confident in my changes. Suggestions are welcome.

Also I chose to keep packv4-create.c in libgit.a and move test code
out to test-packv4.c. Not sure if it's a good decision. The other option
is to copy the necessary code to pack-objects.c, then delete
packv4-create.c in the end. Either way we have the same amount of code
movement.

Thin pack support is not there yet, but it should be simple on
pack-objects' end. Like the compatibility layer you added to
sha1_file.c, this code does not take advantage of v4 as source packs
(performance regressions ensue). There is a lot of room for improvement.

Nguyễn Thái Ngọc Duy (11):
  pack v4: allocate dicts from the beginning
  pack v4: stop using static/global variables in packv4-create.c
  pack v4: move packv4-create.c to libgit.a
  pack v4: add version argument to write_pack_header
  pack-write.c: add pv4_encode_in_pack_object_header
  pack-objects: add --version to specify written pack version
  list-objects.c: add show_tree_entry callback to traverse_commit_list
  pack-objects: create pack v4 tables
  pack-objects: do not cache delta for v4 trees
  pack-objects: exclude commits out of delta objects in v4
  pack-objects: support writing pack v4

 Makefile               |   4 +-
 builtin/pack-objects.c | 187 +++++++++++++++--
 builtin/rev-list.c     |   4 +-
 bulk-checkin.c         |   2 +-
 list-objects.c         |   9 +-
 list-objects.h         |   3 +-
 pack-write.c           |  36 +++-
 pack.h                 |   6 +-
 packv4-create.c        | 534 ++++---------------------------------------------
 packv4-create.h (new)  |  50 +++++
 test-packv4.c (new)    | 476 +++++++++++++++++++++++++++++++++++++++++++
 upload-pack.c          |   2 +-
 12 files changed, 789 insertions(+), 524 deletions(-)
 create mode 100644 packv4-create.h
 create mode 100644 test-packv4.c

-- 
1.8.2.83.gc99314b

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH 01/11] pack v4: allocate dicts from the beginning
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 02/11] pack v4: stop using static/global variables in packv4-create.c Nguyễn Thái Ngọc Duy
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

commit_ident_table and tree_path_table are local to packv4-create.c
and test-packv4.c. Move their allocation out of add_*_dict_entries and
pass the table in as an argument so add_*_dict_entries can be exported
to pack-objects.c.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 packv4-create.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 38fa594..dbc2a03 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -181,14 +181,12 @@ static char *get_nameend_and_tz(char *from, int *tz_val)
 	return end;
 }
 
-static int add_commit_dict_entries(void *buf, unsigned long size)
+int add_commit_dict_entries(struct dict_table *commit_ident_table,
+			    void *buf, unsigned long size)
 {
 	char *name, *end = NULL;
 	int tz_val;
 
-	if (!commit_ident_table)
-		commit_ident_table = create_dict_table();
-
 	/* parse and add author info */
 	name = strstr(buf, "\nauthor ");
 	if (name) {
@@ -212,14 +210,12 @@ static int add_commit_dict_entries(void *buf, unsigned long size)
 	return 0;
 }
 
-static int add_tree_dict_entries(void *buf, unsigned long size)
+static int add_tree_dict_entries(struct dict_table *tree_path_table,
+				 void *buf, unsigned long size)
 {
 	struct tree_desc desc;
 	struct name_entry name_entry;
 
-	if (!tree_path_table)
-		tree_path_table = create_dict_table();
-
 	init_tree_desc(&desc, buf, size);
 	while (tree_entry(&desc, &name_entry)) {
 		int pathlen = tree_entry_len(&name_entry);
@@ -659,6 +655,9 @@ static int create_pack_dictionaries(struct packed_git *p,
 	struct progress *progress_state;
 	unsigned int i;
 
+	commit_ident_table = create_dict_table();
+	tree_path_table = create_dict_table();
+
 	progress_state = start_progress("Scanning objects", p->num_objects);
 	for (i = 0; i < p->num_objects; i++) {
 		struct pack_idx_entry *obj = obj_list[i];
@@ -666,7 +665,8 @@ static int create_pack_dictionaries(struct packed_git *p,
 		enum object_type type;
 		unsigned long size;
 		struct object_info oi = {};
-		int (*add_dict_entries)(void *, unsigned long);
+		int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
+		struct dict_table *dict;
 
 		display_progress(progress_state, i+1);
 
@@ -679,9 +679,11 @@ static int create_pack_dictionaries(struct packed_git *p,
 		switch (type) {
 		case OBJ_COMMIT:
 			add_dict_entries = add_commit_dict_entries;
+			dict = commit_ident_table;
 			break;
 		case OBJ_TREE:
 			add_dict_entries = add_tree_dict_entries;
+			dict = tree_path_table;
 			break;
 		default:
 			continue;
@@ -693,7 +695,7 @@ static int create_pack_dictionaries(struct packed_git *p,
 		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
 			die("packed %s from %s is corrupt",
 			    sha1_to_hex(obj->sha1), p->pack_name);
-		if (add_dict_entries(data, size) < 0)
+		if (add_dict_entries(dict, data, size) < 0)
 			die("can't process %s object %s",
 				typename(type), sha1_to_hex(obj->sha1));
 		free(data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 02/11] pack v4: stop using static/global variables in packv4-create.c
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 01/11] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 03/11] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
                     ` (9 subsequent siblings)
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 packv4-create.c       | 103 ++++++++++++++++++++++++++++----------------------
 packv4-create.h (new) |  11 ++++++
 2 files changed, 69 insertions(+), 45 deletions(-)
 create mode 100644 packv4-create.h

diff --git a/packv4-create.c b/packv4-create.c
index dbc2a03..920a0b4 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -15,6 +15,7 @@
 #include "pack-revindex.h"
 #include "progress.h"
 #include "varint.h"
+#include "packv4-create.h"
 
 
 static int pack_compression_seen;
@@ -145,9 +146,6 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
 	rehash_entries(t);
 }
 
-static struct dict_table *commit_ident_table;
-static struct dict_table *tree_path_table;
-
 /*
  * Parse the author/committer line from a canonical commit object.
  * The 'from' argument points right after the "author " or "committer "
@@ -243,10 +241,10 @@ void dump_dict_table(struct dict_table *t)
 	}
 }
 
-static void dict_dump(void)
+static void dict_dump(struct packv4_tables *v4)
 {
-	dump_dict_table(commit_ident_table);
-	dump_dict_table(tree_path_table);
+	dump_dict_table(v4->commit_ident_table);
+	dump_dict_table(v4->tree_path_table);
 }
 
 /*
@@ -254,10 +252,12 @@ static void dict_dump(void)
  * pack SHA1 table incremented by 1, or the literal SHA1 value prefixed
  * with a zero byte if the needed SHA1 is not available in the table.
  */
-static struct pack_idx_entry *all_objs;
-static unsigned all_objs_nr;
-static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
+
+int encode_sha1ref(const struct packv4_tables *v4,
+		   const unsigned char *sha1, unsigned char *buf)
 {
+	unsigned all_objs_nr = v4->all_objs_nr;
+	struct pack_idx_entry *all_objs = v4->all_objs;
 	unsigned lo = 0, hi = all_objs_nr;
 
 	do {
@@ -284,7 +284,8 @@ static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
  * strict so to ensure the canonical version may always be
  * regenerated and produce the same hash.
  */
-void *pv4_encode_commit(void *buffer, unsigned long *sizep)
+void *pv4_encode_commit(const struct packv4_tables *v4,
+			void *buffer, unsigned long *sizep)
 {
 	unsigned long size = *sizep;
 	char *in, *tail, *end;
@@ -310,7 +311,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	if (get_sha1_lowhex(in + 5, sha1) < 0)
 		goto bad_data;
 	in += 46;
-	out += encode_sha1ref(sha1, out);
+	out += encode_sha1ref(v4, sha1, out);
 
 	/* count how many "parent" lines */
 	nb_parents = 0;
@@ -325,7 +326,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	while (nb_parents--) {
 		if (get_sha1_lowhex(in + 7, sha1))
 			goto bad_data;
-		out += encode_sha1ref(sha1, out);
+		out += encode_sha1ref(v4, sha1, out);
 		in += 48;
 	}
 
@@ -337,7 +338,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	end = get_nameend_and_tz(in, &tz_val);
 	if (!end)
 		goto bad_data;
-	author_index = dict_add_entry(commit_ident_table, tz_val, in, end - in);
+	author_index = dict_add_entry(v4->commit_ident_table, tz_val, in, end - in);
 	if (author_index < 0)
 		goto bad_dict;
 	author_time = strtoul(end, &end, 10);
@@ -353,7 +354,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	end = get_nameend_and_tz(in, &tz_val);
 	if (!end)
 		goto bad_data;
-	commit_index = dict_add_entry(commit_ident_table, tz_val, in, end - in);
+	commit_index = dict_add_entry(v4->commit_ident_table, tz_val, in, end - in);
 	if (commit_index < 0)
 		goto bad_dict;
 	commit_time = strtoul(end, &end, 10);
@@ -436,7 +437,8 @@ static int compare_tree_entries(struct name_entry *e1, struct name_entry *e2)
  * If a delta buffer is provided, we may encode multiple ranges of tree
  * entries against that buffer.
  */
-void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
+void *pv4_encode_tree(const struct packv4_tables *v4,
+		      void *_buffer, unsigned long *sizep,
 		      void *delta, unsigned long delta_size,
 		      const unsigned char *delta_sha1)
 {
@@ -551,7 +553,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			cp += encode_varint(copy_start, cp);
 			cp += encode_varint(copy_count, cp);
 			if (first_delta)
-				cp += encode_sha1ref(delta_sha1, cp);
+				cp += encode_sha1ref(v4, delta_sha1, cp);
 
 			/*
 			 * Now let's make sure this is going to take less
@@ -577,7 +579,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 		}
 
 		pathlen = tree_entry_len(&name_entry);
-		index = dict_add_entry(tree_path_table, name_entry.mode,
+		index = dict_add_entry(v4->tree_path_table, name_entry.mode,
 				       name_entry.path, pathlen);
 		if (index < 0) {
 			error("missing tree dict entry");
@@ -585,7 +587,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			return NULL;
 		}
 		out += encode_varint(index << 1, out);
-		out += encode_sha1ref(name_entry.sha1, out);
+		out += encode_sha1ref(v4, name_entry.sha1, out);
 	}
 
 	if (copy_count) {
@@ -596,7 +598,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 		cp += encode_varint(copy_start, cp);
 		cp += encode_varint(copy_count, cp);
 		if (first_delta)
-			cp += encode_sha1ref(delta_sha1, cp);
+			cp += encode_sha1ref(v4, delta_sha1, cp);
 		if (copy_count >= min_tree_copy &&
 		    cp - copy_buf < out - &buffer[copy_pos]) {
 			out = buffer + copy_pos;
@@ -649,14 +651,15 @@ static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
 	return sorted;
 }
 
-static int create_pack_dictionaries(struct packed_git *p,
+static int create_pack_dictionaries(struct packv4_tables *v4,
+				    struct packed_git *p,
 				    struct pack_idx_entry **obj_list)
 {
 	struct progress *progress_state;
 	unsigned int i;
 
-	commit_ident_table = create_dict_table();
-	tree_path_table = create_dict_table();
+	v4->commit_ident_table = create_dict_table();
+	v4->tree_path_table = create_dict_table();
 
 	progress_state = start_progress("Scanning objects", p->num_objects);
 	for (i = 0; i < p->num_objects; i++) {
@@ -679,11 +682,11 @@ static int create_pack_dictionaries(struct packed_git *p,
 		switch (type) {
 		case OBJ_COMMIT:
 			add_dict_entries = add_commit_dict_entries;
-			dict = commit_ident_table;
+			dict = v4->commit_ident_table;
 			break;
 		case OBJ_TREE:
 			add_dict_entries = add_tree_dict_entries;
-			dict = tree_path_table;
+			dict = v4->tree_path_table;
 			break;
 		default:
 			continue;
@@ -776,9 +779,13 @@ static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
 	return sizeof(hdr);
 }
 
-static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects,
-					 struct pack_idx_entry *objs)
+unsigned long packv4_write_tables(struct sha1file *f,
+				  const struct packv4_tables *v4)
 {
+	unsigned nr_objects = v4->all_objs_nr;
+	struct pack_idx_entry *objs = v4->all_objs;
+	struct dict_table *commit_ident_table = v4->commit_ident_table;
+	struct dict_table *tree_path_table = v4->tree_path_table;
 	unsigned i;
 	unsigned long written = 0;
 
@@ -823,7 +830,8 @@ static int write_object_header(struct sha1file *f, enum object_type type, unsign
 	return len;
 }
 
-static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
+static unsigned long copy_object_data(struct packv4_tables *v4,
+				      struct sha1file *f, struct packed_git *p,
 				      off_t offset)
 {
 	struct pack_window *w_curs = NULL;
@@ -850,11 +858,13 @@ static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
 		if (base_offset <= 0 || base_offset >= offset)
 			die("delta offset out of bound");
 		revidx = find_pack_revindex(p, base_offset);
-		reflen = encode_sha1ref(nth_packed_object_sha1(p, revidx->nr), buf);
+		reflen = encode_sha1ref(v4,
+					nth_packed_object_sha1(p, revidx->nr),
+					buf);
 		sha1write(f, buf, reflen);
 		written += reflen;
 	} else if (type == OBJ_REF_DELTA) {
-		reflen = encode_sha1ref(src + hdrlen, buf);
+		reflen = encode_sha1ref(v4, src + hdrlen, buf);
 		hdrlen += 20;
 		sha1write(f, buf, reflen);
 		written += reflen;
@@ -919,7 +929,8 @@ static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
 	return sha1_buf;
 }
 
-static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
+static off_t packv4_write_object(struct packv4_tables *v4,
+				 struct sha1file *f, struct packed_git *p,
 				 struct pack_idx_entry *obj)
 {
 	void *src, *result;
@@ -941,7 +952,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 	case OBJ_TREE:
 		break;
 	default:
-		return copy_object_data(f, p, obj->offset);
+		return copy_object_data(v4, f, p, obj->offset);
 	}
 
 	/* The rest is converted into their new format */
@@ -955,7 +966,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 
 	switch (type) {
 	case OBJ_COMMIT:
-		result = pv4_encode_commit(src, &buf_size);
+		result = pv4_encode_commit(v4, src, &buf_size);
 		break;
 	case OBJ_TREE:
 		if (packed_type != OBJ_TREE) {
@@ -972,11 +983,12 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 			if (!ref || ref_type != OBJ_TREE)
 				die("cannot obtain delta base for %s",
 						sha1_to_hex(obj->sha1));
-			result = pv4_encode_tree(src, &buf_size,
+			result = pv4_encode_tree(v4, src, &buf_size,
 						 ref, ref_size, ref_sha1);
 			free(ref);
 		} else {
-			result = pv4_encode_tree(src, &buf_size, NULL, 0, NULL);
+			result = pv4_encode_tree(v4, src, &buf_size,
+						 NULL, 0, NULL);
 		}
 		break;
 	default:
@@ -987,7 +999,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		warning("can't convert %s object %s",
 			typename(type), sha1_to_hex(obj->sha1));
 		/* fall back to copy the object in its original form */
-		return copy_object_data(f, p, obj->offset);
+		return copy_object_data(v4, f, p, obj->offset);
 	}
 
 	/* Use bit 3 to indicate a special type encoding */
@@ -1041,7 +1053,7 @@ static struct packed_git *open_pack(const char *path)
 	return p;
 }
 
-static void process_one_pack(char *src_pack, char *dst_pack)
+static void process_one_pack(struct packv4_tables *v4, char *src_pack, char *dst_pack)
 {
 	struct packed_git *p;
 	struct sha1file *f;
@@ -1061,26 +1073,26 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	objs = get_packed_object_list(p);
 	p_objs = sort_objs_by_offset(objs, nr_objects);
 
-	create_pack_dictionaries(p, p_objs);
-	sort_dict_entries_by_hits(commit_ident_table);
-	sort_dict_entries_by_hits(tree_path_table);
+	create_pack_dictionaries(v4, p, p_objs);
+	sort_dict_entries_by_hits(v4->commit_ident_table);
+	sort_dict_entries_by_hits(v4->tree_path_table);
 
 	packname = normalize_pack_name(dst_pack);
 	f = packv4_open(packname);
 	if (!f)
 		die("unable to open destination pack");
 	written += packv4_write_header(f, nr_objects);
-	written += packv4_write_tables(f, nr_objects, objs);
+	written += packv4_write_tables(f, v4);
 
 	/* Let's write objects out, updating the object index list in place */
 	progress_state = start_progress("Writing objects", nr_objects);
-	all_objs = objs;
-	all_objs_nr = nr_objects;
+	v4->all_objs = objs;
+	v4->all_objs_nr = nr_objects;
 	for (i = 0; i < nr_objects; i++) {
 		off_t obj_pos = written;
 		struct pack_idx_entry *obj = p_objs[i];
 		crc32_begin(f);
-		written += packv4_write_object(f, p, obj);
+		written += packv4_write_object(v4, f, p, obj);
 		obj->offset = obj_pos;
 		obj->crc32 = crc32_end(f);
 		display_progress(progress_state, i+1);
@@ -1114,6 +1126,7 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 
 int main(int argc, char *argv[])
 {
+	struct packv4_tables v4;
 	char *src_pack, *dst_pack;
 
 	if (argc == 3) {
@@ -1131,8 +1144,8 @@ int main(int argc, char *argv[])
 	git_config(git_pack_config, NULL);
 	if (!pack_compression_seen && core_compression_seen)
 		pack_compression_level = core_compression_level;
-	process_one_pack(src_pack, dst_pack);
+	process_one_pack(&v4, src_pack, dst_pack);
 	if (0)
-		dict_dump();
+		dict_dump(&v4);
 	return 0;
 }
diff --git a/packv4-create.h b/packv4-create.h
new file mode 100644
index 0000000..0c8c77b
--- /dev/null
+++ b/packv4-create.h
@@ -0,0 +1,11 @@
+#ifndef PACKV4_CREATE_H
+#define PACKV4_CREATE_H
+
+struct packv4_tables {
+	struct pack_idx_entry *all_objs;
+	unsigned all_objs_nr;
+	struct dict_table *commit_ident_table;
+	struct dict_table *tree_path_table;
+};
+
+#endif
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 03/11] pack v4: move packv4-create.c to libgit.a
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 01/11] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 02/11] pack v4: stop using static/global variables in packv4-create.c Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 20:56     ` Nicolas Pitre
  2013-09-08 15:04   ` [PATCH 04/11] pack v4: add version argument to write_pack_header Nguyễn Thái Ngọc Duy
                     ` (8 subsequent siblings)
  11 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

git-packv4-create now becomes test-packv4. Code that will not be used
by pack-objects.c is moved to test-packv4.c. That code may be removed
once the transition to pack-objects is complete.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Makefile            |   4 +-
 packv4-create.c     | 491 +---------------------------------------------------
 packv4-create.h     |  39 +++++
 test-packv4.c (new) | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 525 insertions(+), 485 deletions(-)
 create mode 100644 test-packv4.c

diff --git a/Makefile b/Makefile
index 22fc276..af2e3e3 100644
--- a/Makefile
+++ b/Makefile
@@ -550,7 +550,6 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
-PROGRAM_OBJS += packv4-create.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
@@ -568,6 +567,7 @@ TEST_PROGRAMS_NEED_X += test-line-buffer
 TEST_PROGRAMS_NEED_X += test-match-trees
 TEST_PROGRAMS_NEED_X += test-mergesort
 TEST_PROGRAMS_NEED_X += test-mktemp
+TEST_PROGRAMS_NEED_X += test-packv4
 TEST_PROGRAMS_NEED_X += test-parse-options
 TEST_PROGRAMS_NEED_X += test-path-utils
 TEST_PROGRAMS_NEED_X += test-prio-queue
@@ -702,6 +702,7 @@ LIB_H += notes.h
 LIB_H += object.h
 LIB_H += pack-revindex.h
 LIB_H += pack.h
+LIB_H += packv4-create.h
 LIB_H += packv4-parse.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
@@ -839,6 +840,7 @@ LIB_OBJS += object.o
 LIB_OBJS += pack-check.o
 LIB_OBJS += pack-revindex.o
 LIB_OBJS += pack-write.o
+LIB_OBJS += packv4-create.o
 LIB_OBJS += packv4-parse.o
 LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
diff --git a/packv4-create.c b/packv4-create.c
index 920a0b4..cdf82c0 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -18,9 +18,9 @@
 #include "packv4-create.h"
 
 
-static int pack_compression_seen;
-static int pack_compression_level = Z_DEFAULT_COMPRESSION;
-static int min_tree_copy = 1;
+int pack_compression_seen;
+int pack_compression_level = Z_DEFAULT_COMPRESSION;
+int min_tree_copy = 1;
 
 struct data_entry {
 	unsigned offset;
@@ -28,17 +28,6 @@ struct data_entry {
 	unsigned hits;
 };
 
-struct dict_table {
-	unsigned char *data;
-	unsigned cur_offset;
-	unsigned size;
-	struct data_entry *entry;
-	unsigned nb_entries;
-	unsigned max_entries;
-	unsigned *hash;
-	unsigned hash_size;
-};
-
 struct dict_table *create_dict_table(void)
 {
 	return xcalloc(sizeof(struct dict_table), 1);
@@ -139,7 +128,7 @@ static int cmp_dict_entries(const void *a_, const void *b_)
 	return diff;
 }
 
-static void sort_dict_entries_by_hits(struct dict_table *t)
+void sort_dict_entries_by_hits(struct dict_table *t)
 {
 	qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
 	t->hash_size = (t->nb_entries * 4 / 3) / 2;
@@ -208,7 +197,7 @@ int add_commit_dict_entries(struct dict_table *commit_ident_table,
 	return 0;
 }
 
-static int add_tree_dict_entries(struct dict_table *tree_path_table,
+int add_tree_dict_entries(struct dict_table *tree_path_table,
 				 void *buf, unsigned long size)
 {
 	struct tree_desc desc;
@@ -224,7 +213,7 @@ static int add_tree_dict_entries(struct dict_table *tree_path_table,
 	return 0;
 }
 
-void dump_dict_table(struct dict_table *t)
+static void dump_dict_table(struct dict_table *t)
 {
 	int i;
 
@@ -241,7 +230,7 @@ void dump_dict_table(struct dict_table *t)
 	}
 }
 
-static void dict_dump(struct packv4_tables *v4)
+void dict_dump(struct packv4_tables *v4)
 {
 	dump_dict_table(v4->commit_ident_table);
 	dump_dict_table(v4->tree_path_table);
@@ -611,103 +600,6 @@ void *pv4_encode_tree(const struct packv4_tables *v4,
 	return buffer;
 }
 
-static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
-{
-	unsigned i, nr_objects = p->num_objects;
-	struct pack_idx_entry *objects;
-
-	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
-	objects[nr_objects].offset = p->pack_size - 20;
-	for (i = 0; i < nr_objects; i++) {
-		hashcpy(objects[i].sha1, nth_packed_object_sha1(p, i));
-		objects[i].offset = nth_packed_object_offset(p, i);
-	}
-
-	return objects;
-}
-
-static int sort_by_offset(const void *e1, const void *e2)
-{
-	const struct pack_idx_entry * const *entry1 = e1;
-	const struct pack_idx_entry * const *entry2 = e2;
-	if ((*entry1)->offset < (*entry2)->offset)
-		return -1;
-	if ((*entry1)->offset > (*entry2)->offset)
-		return 1;
-	return 0;
-}
-
-static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
-						    unsigned nr_objects)
-{
-	unsigned i;
-	struct pack_idx_entry **sorted;
-
-	sorted = xmalloc((nr_objects + 1) * sizeof(*sorted));
-	for (i = 0; i < nr_objects + 1; i++)
-		sorted[i] = &list[i];
-	qsort(sorted, nr_objects + 1, sizeof(*sorted), sort_by_offset);
-
-	return sorted;
-}
-
-static int create_pack_dictionaries(struct packv4_tables *v4,
-				    struct packed_git *p,
-				    struct pack_idx_entry **obj_list)
-{
-	struct progress *progress_state;
-	unsigned int i;
-
-	v4->commit_ident_table = create_dict_table();
-	v4->tree_path_table = create_dict_table();
-
-	progress_state = start_progress("Scanning objects", p->num_objects);
-	for (i = 0; i < p->num_objects; i++) {
-		struct pack_idx_entry *obj = obj_list[i];
-		void *data;
-		enum object_type type;
-		unsigned long size;
-		struct object_info oi = {};
-		int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
-		struct dict_table *dict;
-
-		display_progress(progress_state, i+1);
-
-		oi.typep = &type;
-		oi.sizep = &size;
-		if (packed_object_info(p, obj->offset, &oi) < 0)
-			die("cannot get type of %s from %s",
-			    sha1_to_hex(obj->sha1), p->pack_name);
-
-		switch (type) {
-		case OBJ_COMMIT:
-			add_dict_entries = add_commit_dict_entries;
-			dict = v4->commit_ident_table;
-			break;
-		case OBJ_TREE:
-			add_dict_entries = add_tree_dict_entries;
-			dict = v4->tree_path_table;
-			break;
-		default:
-			continue;
-		}
-		data = unpack_entry(p, obj->offset, &type, &size);
-		if (!data)
-			die("cannot unpack %s from %s",
-			    sha1_to_hex(obj->sha1), p->pack_name);
-		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
-			die("packed %s from %s is corrupt",
-			    sha1_to_hex(obj->sha1), p->pack_name);
-		if (add_dict_entries(dict, data, size) < 0)
-			die("can't process %s object %s",
-				typename(type), sha1_to_hex(obj->sha1));
-		free(data);
-	}
-
-	stop_progress(&progress_state);
-	return 0;
-}
-
 static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
 {
 	unsigned char buffer[1024];
@@ -757,28 +649,6 @@ static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
 	return hdrlen + datalen;
 }
 
-static struct sha1file * packv4_open(char *path)
-{
-	int fd;
-
-	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
-	if (fd < 0)
-		die_errno("unable to create '%s'", path);
-	return sha1fd(fd, path);
-}
-
-static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
-{
-	struct pack_header hdr;
-
-	hdr.hdr_signature = htonl(PACK_SIGNATURE);
-	hdr.hdr_version = htonl(4);
-	hdr.hdr_entries = htonl(nr_objects);
-	sha1write(f, &hdr, sizeof(hdr));
-
-	return sizeof(hdr);
-}
-
 unsigned long packv4_write_tables(struct sha1file *f,
 				  const struct packv4_tables *v4)
 {
@@ -802,350 +672,3 @@ unsigned long packv4_write_tables(struct sha1file *f,
 
 	return written;
 }
-
-static int write_object_header(struct sha1file *f, enum object_type type, unsigned long size)
-{
-	unsigned char buf[16];
-	uint64_t val;
-	int len;
-
-	/*
-	 * We really have only one kind of delta object.
-	 */
-	if (type == OBJ_OFS_DELTA)
-		type = OBJ_REF_DELTA;
-
-	/*
-	 * We allocate 4 bits in the LSB for the object type which should
-	 * be good for quite a while, given that we effectively encodes
-	 * only 5 object types: commit, tree, blob, delta, tag.
-	 */
-	val = size;
-	if (MSB(val, 4))
-		die("fixme: the code doesn't currently cope with big sizes");
-	val <<= 4;
-	val |= type;
-	len = encode_varint(val, buf);
-	sha1write(f, buf, len);
-	return len;
-}
-
-static unsigned long copy_object_data(struct packv4_tables *v4,
-				      struct sha1file *f, struct packed_git *p,
-				      off_t offset)
-{
-	struct pack_window *w_curs = NULL;
-	struct revindex_entry *revidx;
-	enum object_type type;
-	unsigned long avail, size, datalen, written;
-	int hdrlen, reflen, idx_nr;
-	unsigned char *src, buf[24];
-
-	revidx = find_pack_revindex(p, offset);
-	idx_nr = revidx->nr;
-	datalen = revidx[1].offset - offset;
-
-	src = use_pack(p, &w_curs, offset, &avail);
-	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
-
-	written = write_object_header(f, type, size);
-
-	if (type == OBJ_OFS_DELTA) {
-		const unsigned char *cp = src + hdrlen;
-		off_t base_offset = decode_varint(&cp);
-		hdrlen = cp - src;
-		base_offset = offset - base_offset;
-		if (base_offset <= 0 || base_offset >= offset)
-			die("delta offset out of bound");
-		revidx = find_pack_revindex(p, base_offset);
-		reflen = encode_sha1ref(v4,
-					nth_packed_object_sha1(p, revidx->nr),
-					buf);
-		sha1write(f, buf, reflen);
-		written += reflen;
-	} else if (type == OBJ_REF_DELTA) {
-		reflen = encode_sha1ref(v4, src + hdrlen, buf);
-		hdrlen += 20;
-		sha1write(f, buf, reflen);
-		written += reflen;
-	}
-
-	if (p->index_version > 1 &&
-	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
-		die("bad CRC for object at offset %"PRIuMAX" in %s",
-		    (uintmax_t)offset, p->pack_name);
-
-	offset += hdrlen;
-	datalen -= hdrlen;
-
-	while (datalen) {
-		src = use_pack(p, &w_curs, offset, &avail);
-		if (avail > datalen)
-			avail = datalen;
-		sha1write(f, src, avail);
-		written += avail;
-		offset += avail;
-		datalen -= avail;
-	}
-	unuse_pack(&w_curs);
-
-	return written;
-}
-
-static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
-				     unsigned char *sha1_buf)
-{
-	struct pack_window *w_curs = NULL;
-	enum object_type type;
-	unsigned long avail, size;
-	int hdrlen;
-	unsigned char *src;
-	const unsigned char *base_sha1 = NULL; ;
-
-	src = use_pack(p, &w_curs, offset, &avail);
-	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
-
-	if (type == OBJ_OFS_DELTA) {
-		const unsigned char *cp = src + hdrlen;
-		off_t base_offset = decode_varint(&cp);
-		base_offset = offset - base_offset;
-		if (base_offset <= 0 || base_offset >= offset) {
-			error("delta offset out of bound");
-		} else {
-			struct revindex_entry *revidx;
-			revidx = find_pack_revindex(p, base_offset);
-			base_sha1 = nth_packed_object_sha1(p, revidx->nr);
-		}
-	} else if (type == OBJ_REF_DELTA) {
-		base_sha1 = src + hdrlen;
-	} else
-		error("expected to get a delta but got a %s", typename(type));
-
-	unuse_pack(&w_curs);
-
-	if (!base_sha1)
-		return NULL;
-	hashcpy(sha1_buf, base_sha1);
-	return sha1_buf;
-}
-
-static off_t packv4_write_object(struct packv4_tables *v4,
-				 struct sha1file *f, struct packed_git *p,
-				 struct pack_idx_entry *obj)
-{
-	void *src, *result;
-	struct object_info oi = {};
-	enum object_type type, packed_type;
-	unsigned long obj_size, buf_size;
-	unsigned int hdrlen;
-
-	oi.typep = &type;
-	oi.sizep = &obj_size;
-	packed_type = packed_object_info(p, obj->offset, &oi);
-	if (packed_type < 0)
-		die("cannot get type of %s from %s",
-		    sha1_to_hex(obj->sha1), p->pack_name);
-
-	/* Some objects are copied without decompression */
-	switch (type) {
-	case OBJ_COMMIT:
-	case OBJ_TREE:
-		break;
-	default:
-		return copy_object_data(v4, f, p, obj->offset);
-	}
-
-	/* The rest is converted into their new format */
-	src = unpack_entry(p, obj->offset, &type, &buf_size);
-	if (!src || obj_size != buf_size)
-		die("cannot unpack %s from %s",
-		    sha1_to_hex(obj->sha1), p->pack_name);
-	if (check_sha1_signature(obj->sha1, src, buf_size, typename(type)))
-		die("packed %s from %s is corrupt",
-		    sha1_to_hex(obj->sha1), p->pack_name);
-
-	switch (type) {
-	case OBJ_COMMIT:
-		result = pv4_encode_commit(v4, src, &buf_size);
-		break;
-	case OBJ_TREE:
-		if (packed_type != OBJ_TREE) {
-			unsigned char sha1_buf[20], *ref_sha1;
-			void *ref;
-			enum object_type ref_type;
-			unsigned long ref_size;
-
-			ref_sha1 = get_delta_base(p, obj->offset, sha1_buf);
-			if (!ref_sha1)
-				die("unable to get delta base sha1 for %s",
-						sha1_to_hex(obj->sha1));
-			ref = read_sha1_file(ref_sha1, &ref_type, &ref_size);
-			if (!ref || ref_type != OBJ_TREE)
-				die("cannot obtain delta base for %s",
-						sha1_to_hex(obj->sha1));
-			result = pv4_encode_tree(v4, src, &buf_size,
-						 ref, ref_size, ref_sha1);
-			free(ref);
-		} else {
-			result = pv4_encode_tree(v4, src, &buf_size,
-						 NULL, 0, NULL);
-		}
-		break;
-	default:
-		die("unexpected object type %d", type);
-	}
-	free(src);
-	if (!result) {
-		warning("can't convert %s object %s",
-			typename(type), sha1_to_hex(obj->sha1));
-		/* fall back to copy the object in its original form */
-		return copy_object_data(v4, f, p, obj->offset);
-	}
-
-	/* Use bit 3 to indicate a special type encoding */
-	type += 8;
-	hdrlen = write_object_header(f, type, obj_size);
-	sha1write(f, result, buf_size);
-	free(result);
-	return hdrlen + buf_size;
-}
-
-static char *normalize_pack_name(const char *path)
-{
-	char buf[PATH_MAX];
-	int len;
-
-	len = strlcpy(buf, path, PATH_MAX);
-	if (len >= PATH_MAX - 6)
-		die("name too long: %s", path);
-
-	/*
-	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
-	 * normalize these forms to "foo.pack".
-	 */
-	if (has_extension(buf, ".idx")) {
-		strcpy(buf + len - 4, ".pack");
-		len++;
-	} else if (!has_extension(buf, ".pack")) {
-		strcpy(buf + len, ".pack");
-		len += 5;
-	}
-
-	return xstrdup(buf);
-}
-
-static struct packed_git *open_pack(const char *path)
-{
-	char *packname = normalize_pack_name(path);
-	int len = strlen(packname);
-	struct packed_git *p;
-
-	strcpy(packname + len - 5, ".idx");
-	p = add_packed_git(packname, len - 1, 1);
-	if (!p)
-		die("packfile %s not found.", packname);
-
-	install_packed_git(p);
-	if (open_pack_index(p))
-		die("packfile %s index not opened", p->pack_name);
-
-	free(packname);
-	return p;
-}
-
-static void process_one_pack(struct packv4_tables *v4, char *src_pack, char *dst_pack)
-{
-	struct packed_git *p;
-	struct sha1file *f;
-	struct pack_idx_entry *objs, **p_objs;
-	struct pack_idx_option idx_opts;
-	unsigned i, nr_objects;
-	off_t written = 0;
-	char *packname;
-	unsigned char pack_sha1[20];
-	struct progress *progress_state;
-
-	p = open_pack(src_pack);
-	if (!p)
-		die("unable to open source pack");
-
-	nr_objects = p->num_objects;
-	objs = get_packed_object_list(p);
-	p_objs = sort_objs_by_offset(objs, nr_objects);
-
-	create_pack_dictionaries(v4, p, p_objs);
-	sort_dict_entries_by_hits(v4->commit_ident_table);
-	sort_dict_entries_by_hits(v4->tree_path_table);
-
-	packname = normalize_pack_name(dst_pack);
-	f = packv4_open(packname);
-	if (!f)
-		die("unable to open destination pack");
-	written += packv4_write_header(f, nr_objects);
-	written += packv4_write_tables(f, v4);
-
-	/* Let's write objects out, updating the object index list in place */
-	progress_state = start_progress("Writing objects", nr_objects);
-	v4->all_objs = objs;
-	v4->all_objs_nr = nr_objects;
-	for (i = 0; i < nr_objects; i++) {
-		off_t obj_pos = written;
-		struct pack_idx_entry *obj = p_objs[i];
-		crc32_begin(f);
-		written += packv4_write_object(v4, f, p, obj);
-		obj->offset = obj_pos;
-		obj->crc32 = crc32_end(f);
-		display_progress(progress_state, i+1);
-	}
-	stop_progress(&progress_state);
-
-	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
-
-	reset_pack_idx_option(&idx_opts);
-	idx_opts.version = 3;
-	strcpy(packname + strlen(packname) - 5, ".idx");
-	write_idx_file(packname, p_objs, nr_objects, &idx_opts, pack_sha1);
-
-	free(packname);
-}
-
-static int git_pack_config(const char *k, const char *v, void *cb)
-{
-	if (!strcmp(k, "pack.compression")) {
-		int level = git_config_int(k, v);
-		if (level == -1)
-			level = Z_DEFAULT_COMPRESSION;
-		else if (level < 0 || level > Z_BEST_COMPRESSION)
-			die("bad pack compression level %d", level);
-		pack_compression_level = level;
-		pack_compression_seen = 1;
-		return 0;
-	}
-	return git_default_config(k, v, cb);
-}
-
-int main(int argc, char *argv[])
-{
-	struct packv4_tables v4;
-	char *src_pack, *dst_pack;
-
-	if (argc == 3) {
-		src_pack = argv[1];
-		dst_pack = argv[2];
-	} else if (argc == 4 && !prefixcmp(argv[1], "--min-tree-copy=")) {
-		min_tree_copy = atoi(argv[1] + strlen("--min-tree-copy="));
-		src_pack = argv[2];
-		dst_pack = argv[3];
-	} else {
-		fprintf(stderr, "Usage: %s [--min-tree-copy=<n>] <src_packfile> <dst_packfile>\n", argv[0]);
-		exit(1);
-	}
-
-	git_config(git_pack_config, NULL);
-	if (!pack_compression_seen && core_compression_seen)
-		pack_compression_level = core_compression_level;
-	process_one_pack(&v4, src_pack, dst_pack);
-	if (0)
-		dict_dump(&v4);
-	return 0;
-}
diff --git a/packv4-create.h b/packv4-create.h
index 0c8c77b..c1f32fd 100644
--- a/packv4-create.h
+++ b/packv4-create.h
@@ -8,4 +8,43 @@ struct packv4_tables {
 	struct dict_table *tree_path_table;
 };
 
+struct dict_table {
+	unsigned char *data;
+	unsigned cur_offset;
+	unsigned size;
+	struct data_entry *entry;
+	unsigned nb_entries;
+	unsigned max_entries;
+	unsigned *hash;
+	unsigned hash_size;
+};
+
+
+struct sha1file;
+
+struct dict_table *create_dict_table(void);
+int dict_add_entry(struct dict_table *t, int val, const char *str, int str_len);
+void destroy_dict_table(struct dict_table *t);
+void dict_dump(struct packv4_tables *v4);
+
+int add_commit_dict_entries(struct dict_table *commit_ident_table,
+			    void *buf, unsigned long size);
+int add_tree_dict_entries(struct dict_table *tree_path_table,
+			  void *buf, unsigned long size);
+void sort_dict_entries_by_hits(struct dict_table *t);
+
+int encode_sha1ref(const struct packv4_tables *v4,
+		   const unsigned char *sha1, unsigned char *buf);
+unsigned long packv4_write_tables(struct sha1file *f,
+				  const struct packv4_tables *v4);
+void *pv4_encode_commit(const struct packv4_tables *v4,
+			void *buffer, unsigned long *sizep);
+void *pv4_encode_tree(const struct packv4_tables *v4,
+		      void *_buffer, unsigned long *sizep,
+		      void *delta, unsigned long delta_size,
+		      const unsigned char *delta_sha1);
+
+void process_one_pack(struct packv4_tables *v4,
+		      char *src_pack, char *dst_pack);
+
 #endif
diff --git a/test-packv4.c b/test-packv4.c
new file mode 100644
index 0000000..3b0d7a2
--- /dev/null
+++ b/test-packv4.c
@@ -0,0 +1,476 @@
+#include "cache.h"
+#include "pack.h"
+#include "pack-revindex.h"
+#include "progress.h"
+#include "varint.h"
+#include "packv4-create.h"
+
+extern int pack_compression_seen;
+extern int pack_compression_level;
+extern int min_tree_copy;
+
+static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
+{
+	unsigned i, nr_objects = p->num_objects;
+	struct pack_idx_entry *objects;
+
+	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+	objects[nr_objects].offset = p->pack_size - 20;
+	for (i = 0; i < nr_objects; i++) {
+		hashcpy(objects[i].sha1, nth_packed_object_sha1(p, i));
+		objects[i].offset = nth_packed_object_offset(p, i);
+	}
+
+	return objects;
+}
+
+static int sort_by_offset(const void *e1, const void *e2)
+{
+	const struct pack_idx_entry * const *entry1 = e1;
+	const struct pack_idx_entry * const *entry2 = e2;
+	if ((*entry1)->offset < (*entry2)->offset)
+		return -1;
+	if ((*entry1)->offset > (*entry2)->offset)
+		return 1;
+	return 0;
+}
+
+static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
+						    unsigned nr_objects)
+{
+	unsigned i;
+	struct pack_idx_entry **sorted;
+
+	sorted = xmalloc((nr_objects + 1) * sizeof(*sorted));
+	for (i = 0; i < nr_objects + 1; i++)
+		sorted[i] = &list[i];
+	qsort(sorted, nr_objects + 1, sizeof(*sorted), sort_by_offset);
+
+	return sorted;
+}
+
+static int create_pack_dictionaries(struct packv4_tables *v4,
+				    struct packed_git *p,
+				    struct pack_idx_entry **obj_list)
+{
+	struct progress *progress_state;
+	unsigned int i;
+
+	v4->commit_ident_table = create_dict_table();
+	v4->tree_path_table = create_dict_table();
+
+	progress_state = start_progress("Scanning objects", p->num_objects);
+	for (i = 0; i < p->num_objects; i++) {
+		struct pack_idx_entry *obj = obj_list[i];
+		void *data;
+		enum object_type type;
+		unsigned long size;
+		struct object_info oi = {};
+		int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
+		struct dict_table *dict;
+
+		display_progress(progress_state, i+1);
+
+		oi.typep = &type;
+		oi.sizep = &size;
+		if (packed_object_info(p, obj->offset, &oi) < 0)
+			die("cannot get type of %s from %s",
+			    sha1_to_hex(obj->sha1), p->pack_name);
+
+		switch (type) {
+		case OBJ_COMMIT:
+			add_dict_entries = add_commit_dict_entries;
+			dict = v4->commit_ident_table;
+			break;
+		case OBJ_TREE:
+			add_dict_entries = add_tree_dict_entries;
+			dict = v4->tree_path_table;
+			break;
+		default:
+			continue;
+		}
+		data = unpack_entry(p, obj->offset, &type, &size);
+		if (!data)
+			die("cannot unpack %s from %s",
+			    sha1_to_hex(obj->sha1), p->pack_name);
+		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
+			die("packed %s from %s is corrupt",
+			    sha1_to_hex(obj->sha1), p->pack_name);
+		if (add_dict_entries(dict, data, size) < 0)
+			die("can't process %s object %s",
+				typename(type), sha1_to_hex(obj->sha1));
+		free(data);
+	}
+
+	stop_progress(&progress_state);
+	return 0;
+}
+
+static struct sha1file * packv4_open(char *path)
+{
+	int fd;
+
+	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
+	if (fd < 0)
+		die_errno("unable to create '%s'", path);
+	return sha1fd(fd, path);
+}
+
+static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
+{
+	struct pack_header hdr;
+
+	hdr.hdr_signature = htonl(PACK_SIGNATURE);
+	hdr.hdr_version = htonl(4);
+	hdr.hdr_entries = htonl(nr_objects);
+	sha1write(f, &hdr, sizeof(hdr));
+
+	return sizeof(hdr);
+}
+
+static int write_object_header(struct sha1file *f, enum object_type type, unsigned long size)
+{
+	unsigned char buf[16];
+	uint64_t val;
+	int len;
+
+	/*
+	 * We really have only one kind of delta object.
+	 */
+	if (type == OBJ_OFS_DELTA)
+		type = OBJ_REF_DELTA;
+
+	/*
+	 * We allocate 4 bits in the LSB for the object type which should
+	 * be good for quite a while, given that we effectively encode
+	 * only 5 object types: commit, tree, blob, delta, tag.
+	 */
+	val = size;
+	if (MSB(val, 4))
+		die("fixme: the code doesn't currently cope with big sizes");
+	val <<= 4;
+	val |= type;
+	len = encode_varint(val, buf);
+	sha1write(f, buf, len);
+	return len;
+}
+
+static unsigned long copy_object_data(struct packv4_tables *v4,
+				      struct sha1file *f, struct packed_git *p,
+				      off_t offset)
+{
+	struct pack_window *w_curs = NULL;
+	struct revindex_entry *revidx;
+	enum object_type type;
+	unsigned long avail, size, datalen, written;
+	int hdrlen, reflen, idx_nr;
+	unsigned char *src, buf[24];
+
+	revidx = find_pack_revindex(p, offset);
+	idx_nr = revidx->nr;
+	datalen = revidx[1].offset - offset;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	written = write_object_header(f, type, size);
+
+	if (type == OBJ_OFS_DELTA) {
+		const unsigned char *cp = src + hdrlen;
+		off_t base_offset = decode_varint(&cp);
+		hdrlen = cp - src;
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset)
+			die("delta offset out of bound");
+		revidx = find_pack_revindex(p, base_offset);
+		reflen = encode_sha1ref(v4,
+					nth_packed_object_sha1(p, revidx->nr),
+					buf);
+		sha1write(f, buf, reflen);
+		written += reflen;
+	} else if (type == OBJ_REF_DELTA) {
+		reflen = encode_sha1ref(v4, src + hdrlen, buf);
+		hdrlen += 20;
+		sha1write(f, buf, reflen);
+		written += reflen;
+	}
+
+	if (p->index_version > 1 &&
+	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
+		die("bad CRC for object at offset %"PRIuMAX" in %s",
+		    (uintmax_t)offset, p->pack_name);
+
+	offset += hdrlen;
+	datalen -= hdrlen;
+
+	while (datalen) {
+		src = use_pack(p, &w_curs, offset, &avail);
+		if (avail > datalen)
+			avail = datalen;
+		sha1write(f, src, avail);
+		written += avail;
+		offset += avail;
+		datalen -= avail;
+	}
+	unuse_pack(&w_curs);
+
+	return written;
+}
+
+static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
+				     unsigned char *sha1_buf)
+{
+	struct pack_window *w_curs = NULL;
+	enum object_type type;
+	unsigned long avail, size;
+	int hdrlen;
+	unsigned char *src;
+	const unsigned char *base_sha1 = NULL; ;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	if (type == OBJ_OFS_DELTA) {
+		const unsigned char *cp = src + hdrlen;
+		off_t base_offset = decode_varint(&cp);
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset) {
+			error("delta offset out of bound");
+		} else {
+			struct revindex_entry *revidx;
+			revidx = find_pack_revindex(p, base_offset);
+			base_sha1 = nth_packed_object_sha1(p, revidx->nr);
+		}
+	} else if (type == OBJ_REF_DELTA) {
+		base_sha1 = src + hdrlen;
+	} else
+		error("expected to get a delta but got a %s", typename(type));
+
+	unuse_pack(&w_curs);
+
+	if (!base_sha1)
+		return NULL;
+	hashcpy(sha1_buf, base_sha1);
+	return sha1_buf;
+}
+
+static off_t packv4_write_object(struct packv4_tables *v4,
+				 struct sha1file *f, struct packed_git *p,
+				 struct pack_idx_entry *obj)
+{
+	void *src, *result;
+	struct object_info oi = {};
+	enum object_type type, packed_type;
+	unsigned long obj_size, buf_size;
+	unsigned int hdrlen;
+
+	oi.typep = &type;
+	oi.sizep = &obj_size;
+	packed_type = packed_object_info(p, obj->offset, &oi);
+	if (packed_type < 0)
+		die("cannot get type of %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	/* Some objects are copied without decompression */
+	switch (type) {
+	case OBJ_COMMIT:
+	case OBJ_TREE:
+		break;
+	default:
+		return copy_object_data(v4, f, p, obj->offset);
+	}
+
+	/* The rest are converted into their new format */
+	src = unpack_entry(p, obj->offset, &type, &buf_size);
+	if (!src || obj_size != buf_size)
+		die("cannot unpack %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+	if (check_sha1_signature(obj->sha1, src, buf_size, typename(type)))
+		die("packed %s from %s is corrupt",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	switch (type) {
+	case OBJ_COMMIT:
+		result = pv4_encode_commit(v4, src, &buf_size);
+		break;
+	case OBJ_TREE:
+		if (packed_type != OBJ_TREE) {
+			unsigned char sha1_buf[20], *ref_sha1;
+			void *ref;
+			enum object_type ref_type;
+			unsigned long ref_size;
+
+			ref_sha1 = get_delta_base(p, obj->offset, sha1_buf);
+			if (!ref_sha1)
+				die("unable to get delta base sha1 for %s",
+						sha1_to_hex(obj->sha1));
+			ref = read_sha1_file(ref_sha1, &ref_type, &ref_size);
+			if (!ref || ref_type != OBJ_TREE)
+				die("cannot obtain delta base for %s",
+						sha1_to_hex(obj->sha1));
+			result = pv4_encode_tree(v4, src, &buf_size,
+						 ref, ref_size, ref_sha1);
+			free(ref);
+		} else {
+			result = pv4_encode_tree(v4, src, &buf_size,
+						 NULL, 0, NULL);
+		}
+		break;
+	default:
+		die("unexpected object type %d", type);
+	}
+	free(src);
+	if (!result) {
+		warning("can't convert %s object %s",
+			typename(type), sha1_to_hex(obj->sha1));
+		/* fall back to copying the object in its original form */
+		return copy_object_data(v4, f, p, obj->offset);
+	}
+
+	/* Use bit 3 to indicate a special type encoding */
+	type += 8;
+	hdrlen = write_object_header(f, type, obj_size);
+	sha1write(f, result, buf_size);
+	free(result);
+	return hdrlen + buf_size;
+}
+
+static char *normalize_pack_name(const char *path)
+{
+	char buf[PATH_MAX];
+	int len;
+
+	len = strlcpy(buf, path, PATH_MAX);
+	if (len >= PATH_MAX - 6)
+		die("name too long: %s", path);
+
+	/*
+	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
+	 * normalize these forms to "foo.pack".
+	 */
+	if (has_extension(buf, ".idx")) {
+		strcpy(buf + len - 4, ".pack");
+		len++;
+	} else if (!has_extension(buf, ".pack")) {
+		strcpy(buf + len, ".pack");
+		len += 5;
+	}
+
+	return xstrdup(buf);
+}
+
+static struct packed_git *open_pack(const char *path)
+{
+	char *packname = normalize_pack_name(path);
+	int len = strlen(packname);
+	struct packed_git *p;
+
+	strcpy(packname + len - 5, ".idx");
+	p = add_packed_git(packname, len - 1, 1);
+	if (!p)
+		die("packfile %s not found.", packname);
+
+	install_packed_git(p);
+	if (open_pack_index(p))
+		die("packfile %s index not opened", p->pack_name);
+
+	free(packname);
+	return p;
+}
+
+void process_one_pack(struct packv4_tables *v4, char *src_pack, char *dst_pack)
+{
+	struct packed_git *p;
+	struct sha1file *f;
+	struct pack_idx_entry *objs, **p_objs;
+	struct pack_idx_option idx_opts;
+	unsigned i, nr_objects;
+	off_t written = 0;
+	char *packname;
+	unsigned char pack_sha1[20];
+	struct progress *progress_state;
+
+	p = open_pack(src_pack);
+	if (!p)
+		die("unable to open source pack");
+
+	nr_objects = p->num_objects;
+	objs = get_packed_object_list(p);
+	p_objs = sort_objs_by_offset(objs, nr_objects);
+
+	create_pack_dictionaries(v4, p, p_objs);
+	sort_dict_entries_by_hits(v4->commit_ident_table);
+	sort_dict_entries_by_hits(v4->tree_path_table);
+
+	packname = normalize_pack_name(dst_pack);
+	f = packv4_open(packname);
+	if (!f)
+		die("unable to open destination pack");
+	written += packv4_write_header(f, nr_objects);
+	written += packv4_write_tables(f, v4);
+
+	/* Let's write objects out, updating the object index list in place */
+	progress_state = start_progress("Writing objects", nr_objects);
+	v4->all_objs = objs;
+	v4->all_objs_nr = nr_objects;
+	for (i = 0; i < nr_objects; i++) {
+		off_t obj_pos = written;
+		struct pack_idx_entry *obj = p_objs[i];
+		crc32_begin(f);
+		written += packv4_write_object(v4, f, p, obj);
+		obj->offset = obj_pos;
+		obj->crc32 = crc32_end(f);
+		display_progress(progress_state, i+1);
+	}
+	stop_progress(&progress_state);
+
+	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
+
+	reset_pack_idx_option(&idx_opts);
+	idx_opts.version = 3;
+	strcpy(packname + strlen(packname) - 5, ".idx");
+	write_idx_file(packname, p_objs, nr_objects, &idx_opts, pack_sha1);
+
+	free(packname);
+}
+
+static int git_pack_config(const char *k, const char *v, void *cb)
+{
+	if (!strcmp(k, "pack.compression")) {
+		int level = git_config_int(k, v);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad pack compression level %d", level);
+		pack_compression_level = level;
+		pack_compression_seen = 1;
+		return 0;
+	}
+	return git_default_config(k, v, cb);
+}
+
+int main(int argc, char *argv[])
+{
+	struct packv4_tables v4;
+	char *src_pack, *dst_pack;
+
+	if (argc == 3) {
+		src_pack = argv[1];
+		dst_pack = argv[2];
+	} else if (argc == 4 && !prefixcmp(argv[1], "--min-tree-copy=")) {
+		min_tree_copy = atoi(argv[1] + strlen("--min-tree-copy="));
+		src_pack = argv[2];
+		dst_pack = argv[3];
+	} else {
+		fprintf(stderr, "Usage: %s [--min-tree-copy=<n>] <src_packfile> <dst_packfile>\n", argv[0]);
+		exit(1);
+	}
+
+	git_config(git_pack_config, NULL);
+	if (!pack_compression_seen && core_compression_seen)
+		pack_compression_level = core_compression_level;
+	process_one_pack(&v4, src_pack, dst_pack);
+	if (0)
+		dict_dump(&v4);
+	return 0;
+}
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 04/11] pack v4: add version argument to write_pack_header
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 03/11] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 05/11] pack-write.c: add pv4_encode_in_pack_object_header Nguyễn Thái Ngọc Duy
                     ` (7 subsequent siblings)
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 2 +-
 bulk-checkin.c         | 2 +-
 pack-write.c           | 7 +++++--
 pack.h                 | 3 +--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f069462..33faea8 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -735,7 +735,7 @@ static void write_pack_file(void)
 		else
 			f = create_tmp_packfile(&pack_tmp_name);
 
-		offset = write_pack_header(f, nr_remaining);
+		offset = write_pack_header(f, 2, nr_remaining);
 		if (!offset)
 			die_errno("unable to write pack header");
 		nr_written = 0;
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6b0b6d4..9d8f0d0 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -176,7 +176,7 @@ static void prepare_to_stream(struct bulk_checkin_state *state,
 	reset_pack_idx_option(&state->pack_idx_opts);
 
 	/* Pretend we are going to write only one object */
-	state->offset = write_pack_header(state->f, 1);
+	state->offset = write_pack_header(state->f, 2, 1);
 	if (!state->offset)
 		die_errno("unable to write pack header");
 }
diff --git a/pack-write.c b/pack-write.c
index 631007e..88e4788 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -186,12 +186,15 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	return index_name;
 }
 
-off_t write_pack_header(struct sha1file *f, uint32_t nr_entries)
+off_t write_pack_header(struct sha1file *f,
+			int version, uint32_t nr_entries)
 {
 	struct pack_header hdr;
 
 	hdr.hdr_signature = htonl(PACK_SIGNATURE);
-	hdr.hdr_version = htonl(PACK_VERSION);
+	hdr.hdr_version = htonl(version);
+	if (!pack_version_ok(hdr.hdr_version))
+		die(_("pack version %d is not supported"), version);
 	hdr.hdr_entries = htonl(nr_entries);
 	if (sha1write(f, &hdr, sizeof(hdr)))
 		return 0;
diff --git a/pack.h b/pack.h
index aa6ee7d..855f6c6 100644
--- a/pack.h
+++ b/pack.h
@@ -8,7 +8,6 @@
  * Packed object header
  */
 #define PACK_SIGNATURE 0x5041434b	/* "PACK" */
-#define PACK_VERSION 2
 #define pack_version_ok(v) ((v) == htonl(2) || (v) == htonl(3))
 struct pack_header {
 	uint32_t hdr_signature;
@@ -80,7 +79,7 @@ extern const char *write_idx_file(const char *index_name, struct pack_idx_entry
 extern int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, off_t offset, off_t len, unsigned int nr);
 extern int verify_pack_index(struct packed_git *);
 extern int verify_pack(struct packed_git *, verify_fn fn, struct progress *, uint32_t);
-extern off_t write_pack_header(struct sha1file *f, uint32_t);
+extern off_t write_pack_header(struct sha1file *f, int, uint32_t);
 extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
 extern char *index_pack_lockfile(int fd);
 extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 05/11] pack-write.c: add pv4_encode_in_pack_object_header
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (3 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 04/11] pack v4: add version argument to write_pack_header Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 20:51     ` Nicolas Pitre
  2013-09-08 15:04   ` [PATCH 06/11] pack-objects: add --version to specify written pack version Nguyễn Thái Ngọc Duy
                     ` (6 subsequent siblings)
  11 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-write.c | 29 +++++++++++++++++++++++++++++
 pack.h       |  1 +
 2 files changed, 30 insertions(+)

diff --git a/pack-write.c b/pack-write.c
index 88e4788..6f11104 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -1,6 +1,7 @@
 #include "cache.h"
 #include "pack.h"
 #include "csum-file.h"
+#include "varint.h"
 
 void reset_pack_idx_option(struct pack_idx_option *opts)
 {
@@ -340,6 +341,34 @@ int encode_in_pack_object_header(enum object_type type, uintmax_t size, unsigned
 	return n;
 }
 
+/*
+ * The pack v4 per-object header is a single varint (see varint.h)
+ * encoding the value (size << 4) | type:
+ *  - the low four bits hold the object type,
+ *  - the remaining upper bits hold the object size.
+ * Returns the number of bytes written to hdr.
+ */
+int pv4_encode_in_pack_object_header(enum object_type type,
+				     uintmax_t size, unsigned char *hdr)
+{
+	uintmax_t val;
+	if (type < OBJ_COMMIT || type > OBJ_PV4_TREE || type == OBJ_OFS_DELTA)
+		die("bad type %d", type);
+
+	/*
+	 * We allocate 4 bits in the LSB for the object type which
+	 * should be good for quite a while, given that we effectively
+	 * encode only 5 object types: commit, tree, blob, delta,
+	 * tag.
+	 */
+	val = size;
+	if (MSB(val, 4))
+		die("fixme: the code doesn't currently cope with big sizes");
+	val <<= 4;
+	val |= type;
+	return encode_varint(val, hdr);
+}
+
 struct sha1file *create_tmp_packfile(char **pack_tmp_name)
 {
 	char tmpname[PATH_MAX];
diff --git a/pack.h b/pack.h
index 855f6c6..38f869d 100644
--- a/pack.h
+++ b/pack.h
@@ -83,6 +83,7 @@ extern off_t write_pack_header(struct sha1file *f, int, uint32_t);
 extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
 extern char *index_pack_lockfile(int fd);
 extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
+extern int pv4_encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
 
 #define PH_ERROR_EOF		(-1)
 #define PH_ERROR_PACK_SIGNATURE	(-2)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 06/11] pack-objects: add --version to specify written pack version
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (4 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 05/11] pack-write.c: add pv4_encode_in_pack_object_header Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 07/11] list-objects.c: add show_tree_entry callback to traverse_commit_list Nguyễn Thái Ngọc Duy
                     ` (5 subsequent siblings)
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 33faea8..ef68fc5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -81,6 +81,7 @@ static int num_preferred_base;
 static struct progress *progress_state;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 static int pack_compression_seen;
+static int pack_version = 2;
 
 static unsigned long delta_cache_size = 0;
 static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
@@ -735,7 +736,7 @@ static void write_pack_file(void)
 		else
 			f = create_tmp_packfile(&pack_tmp_name);
 
-		offset = write_pack_header(f, 2, nr_remaining);
+		offset = write_pack_header(f, pack_version, nr_remaining);
 		if (!offset)
 			die_errno("unable to write pack header");
 		nr_written = 0;
@@ -2455,6 +2456,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		{ OPTION_CALLBACK, 0, "index-version", NULL, N_("version[,offset]"),
 		  N_("write the pack index file in the specified idx format version"),
 		  0, option_parse_index_version },
+		OPT_INTEGER(0, "version", &pack_version, N_("pack version")),
 		OPT_ULONG(0, "max-pack-size", &pack_size_limit,
 			  N_("maximum size of each output pack file")),
 		OPT_BOOL(0, "local", &local,
@@ -2525,6 +2527,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
+	if (pack_version != 2)
+		die(_("pack version %d is not supported"), pack_version);
 
 	rp_av[rp_ac++] = "pack-objects";
 	if (thin) {
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 07/11] list-objects.c: add show_tree_entry callback to traverse_commit_list
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (5 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 06/11] pack-objects: add --version to specify written pack version Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 08/11] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
                     ` (4 subsequent siblings)
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

This helps construct the tree dictionary in pack v4.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 2 +-
 builtin/rev-list.c     | 4 ++--
 list-objects.c         | 9 ++++++++-
 list-objects.h         | 3 ++-
 upload-pack.c          | 2 +-
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ef68fc5..b38d3dc 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2380,7 +2380,7 @@ static void get_object_list(int ac, const char **av)
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed");
 	mark_edges_uninteresting(revs.commits, &revs, show_edge);
-	traverse_commit_list(&revs, show_commit, show_object, NULL);
+	traverse_commit_list(&revs, show_commit, NULL, show_object, NULL);
 
 	if (keep_unreachable)
 		add_objects_in_unpacked_packs(&revs);
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index a5ec30d..b25f896 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -243,7 +243,7 @@ static int show_bisect_vars(struct rev_list_info *info, int reaches, int all)
 		strcpy(hex, sha1_to_hex(revs->commits->item->object.sha1));
 
 	if (flags & BISECT_SHOW_ALL) {
-		traverse_commit_list(revs, show_commit, show_object, info);
+		traverse_commit_list(revs, show_commit, NULL, show_object, info);
 		printf("------\n");
 	}
 
@@ -348,7 +348,7 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			return show_bisect_vars(&info, reaches, all);
 	}
 
-	traverse_commit_list(&revs, show_commit, show_object, &info);
+	traverse_commit_list(&revs, show_commit, NULL, show_object, &info);
 
 	if (revs.count) {
 		if (revs.left_right && revs.cherry_mark)
diff --git a/list-objects.c b/list-objects.c
index 3dd4a96..6def897 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -61,6 +61,7 @@ static void process_gitlink(struct rev_info *revs,
 
 static void process_tree(struct rev_info *revs,
 			 struct tree *tree,
+			 show_tree_entry_fn show_tree_entry,
 			 show_object_fn show,
 			 struct name_path *path,
 			 struct strbuf *base,
@@ -107,9 +108,13 @@ static void process_tree(struct rev_info *revs,
 				continue;
 		}
 
+		if (show_tree_entry)
+			show_tree_entry(&entry, cb_data);
+
 		if (S_ISDIR(entry.mode))
 			process_tree(revs,
 				     lookup_tree(entry.sha1),
+				     show_tree_entry,
 				     show, &me, base, entry.path,
 				     cb_data);
 		else if (S_ISGITLINK(entry.mode))
@@ -167,6 +172,7 @@ static void add_pending_tree(struct rev_info *revs, struct tree *tree)
 
 void traverse_commit_list(struct rev_info *revs,
 			  show_commit_fn show_commit,
+			  show_tree_entry_fn show_tree_entry,
 			  show_object_fn show_object,
 			  void *data)
 {
@@ -196,7 +202,8 @@ void traverse_commit_list(struct rev_info *revs,
 			continue;
 		}
 		if (obj->type == OBJ_TREE) {
-			process_tree(revs, (struct tree *)obj, show_object,
+			process_tree(revs, (struct tree *)obj,
+				     show_tree_entry, show_object,
 				     NULL, &base, name, data);
 			continue;
 		}
diff --git a/list-objects.h b/list-objects.h
index 3db7bb6..297b2e0 100644
--- a/list-objects.h
+++ b/list-objects.h
@@ -2,8 +2,9 @@
 #define LIST_OBJECTS_H
 
 typedef void (*show_commit_fn)(struct commit *, void *);
+typedef void (*show_tree_entry_fn)(const struct name_entry *, void *);
 typedef void (*show_object_fn)(struct object *, const struct name_path *, const char *, void *);
-void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);
+void traverse_commit_list(struct rev_info *, show_commit_fn, show_tree_entry_fn, show_object_fn, void *);
 
 typedef void (*show_edge_fn)(struct commit *);
 void mark_edges_uninteresting(struct commit_list *, struct rev_info *, show_edge_fn);
diff --git a/upload-pack.c b/upload-pack.c
index 127e59a..ccf76d9 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -125,7 +125,7 @@ static int do_rev_list(int in, int out, void *user_data)
 		for (i = 0; i < extra_edge_obj.nr; i++)
 			fprintf(pack_pipe, "-%s\n", sha1_to_hex(
 					extra_edge_obj.objects[i].item->sha1));
-	traverse_commit_list(&revs, show_commit, show_object, NULL);
+	traverse_commit_list(&revs, show_commit, NULL, show_object, NULL);
 	fflush(pack_pipe);
 	fclose(pack_pipe);
 	return 0;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 08/11] pack-objects: create pack v4 tables
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (6 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 07/11] list-objects.c: add show_tree_entry callback to traverse_commit_list Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-09 10:40     ` Duy Nguyen
  2013-09-08 15:04   ` [PATCH 09/11] pack-objects: do not cache delta for v4 trees Nguyễn Thái Ngọc Duy
                     ` (3 subsequent siblings)
  11 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b38d3dc..69a22c1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -18,6 +18,7 @@
 #include "refs.h"
 #include "streaming.h"
 #include "thread-utils.h"
+#include "packv4-create.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [options...] [< ref-list | < object-list]"),
@@ -61,6 +62,8 @@ static struct object_entry *objects;
 static struct pack_idx_entry **written_list;
 static uint32_t nr_objects, nr_alloc, nr_result, nr_written;
 
+static struct packv4_tables v4;
+
 static int non_empty;
 static int reuse_delta = 1, reuse_object = 1;
 static int keep_unreachable, unpack_unreachable, include_tag;
@@ -2039,12 +2042,42 @@ static int add_ref_tag(const char *path, const unsigned char *sha1, int flag, vo
 	return 0;
 }
 
+static int sha1_idx_sort(const void *a_, const void *b_)
+{
+	const struct pack_idx_entry *a = a_;
+	const struct pack_idx_entry *b = b_;
+	return hashcmp(a->sha1, b->sha1);
+}
+
+static void prepare_sha1_table(void)
+{
+	unsigned i;
+	/*
+	 * This table includes SHA-1s that may not be present in the
+	 * pack. One use of such SHA-1s is for completing thin
+	 * packs, where index-pack does not need to add SHA-1s to the
+	 * table at completion time.
+	 */
+	v4.all_objs = xmalloc(nr_objects * sizeof(*v4.all_objs));
+	v4.all_objs_nr = nr_objects;
+	for (i = 0; i < nr_objects; i++)
+		v4.all_objs[i] = objects[i].idx;
+	qsort(v4.all_objs, nr_objects, sizeof(*v4.all_objs),
+	      sha1_idx_sort);
+}
+
 static void prepare_pack(int window, int depth)
 {
 	struct object_entry **delta_list;
 	uint32_t i, nr_deltas;
 	unsigned n;
 
+	if (pack_version == 4) {
+		sort_dict_entries_by_hits(v4.commit_ident_table);
+		sort_dict_entries_by_hits(v4.tree_path_table);
+		prepare_sha1_table();
+	}
+
 	get_object_details();
 
 	/*
@@ -2191,6 +2224,34 @@ static void read_object_list_from_stdin(void)
 
 		add_preferred_base_object(line+41);
 		add_object_entry(sha1, 0, line+41, 0);
+
+		if (pack_version == 4) {
+			void *data;
+			enum object_type type;
+			unsigned long size;
+			int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
+			struct dict_table *dict;
+
+			switch (sha1_object_info(sha1, &size)) {
+			case OBJ_COMMIT:
+				add_dict_entries = add_commit_dict_entries;
+				dict = v4.commit_ident_table;
+				break;
+			case OBJ_TREE:
+				add_dict_entries = add_tree_dict_entries;
+				dict = v4.tree_path_table;
+				break;
+			default:
+				continue;
+			}
+			data = read_sha1_file(sha1, &type, &size);
+			if (!data)
+				die("cannot unpack %s", sha1_to_hex(sha1));
+			if (add_dict_entries(dict, data, size) < 0)
+				die("can't process %s object %s",
+				    typename(type), sha1_to_hex(sha1));
+			free(data);
+		}
 	}
 }
 
@@ -2198,10 +2259,26 @@ static void read_object_list_from_stdin(void)
 
 static void show_commit(struct commit *commit, void *data)
 {
+	if (pack_version == 4) {
+		unsigned long size;
+		enum object_type type;
+		unsigned char *buf;
+
+		/* commit->buffer is NULL most of the time, don't bother */
+		buf = read_sha1_file(commit->object.sha1, &type, &size);
+		add_commit_dict_entries(v4.commit_ident_table, buf, size);
+		free(buf);
+	}
 	add_object_entry(commit->object.sha1, OBJ_COMMIT, NULL, 0);
 	commit->object.flags |= OBJECT_ADDED;
 }
 
+static void show_tree_entry(const struct name_entry *entry, void *data)
+{
+	dict_add_entry(v4.tree_path_table, entry->mode, entry->path,
+		       tree_entry_len(entry));
+}
+
 static void show_object(struct object *obj,
 			const struct name_path *path, const char *last,
 			void *data)
@@ -2380,7 +2457,9 @@ static void get_object_list(int ac, const char **av)
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed");
 	mark_edges_uninteresting(revs.commits, &revs, show_edge);
-	traverse_commit_list(&revs, show_commit, NULL, show_object, NULL);
+	traverse_commit_list(&revs, show_commit,
+			     pack_version == 4 ? show_tree_entry : NULL,
+			     show_object, NULL);
 
 	if (keep_unreachable)
 		add_objects_in_unpacked_packs(&revs);
@@ -2527,7 +2606,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
-	if (pack_version != 2)
+	if (pack_version != 2 && pack_version != 4)
 		die(_("pack version %d is not supported"), pack_version);
 
 	rp_av[rp_ac++] = "pack-objects";
@@ -2579,6 +2658,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		progress = 2;
 
 	prepare_packed_git();
+	if (pack_version == 4) {
+		v4.commit_ident_table = create_dict_table();
+		v4.tree_path_table = create_dict_table();
+	}
 
 	if (progress)
 		progress_state = start_progress("Counting objects", 0);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 09/11] pack-objects: do not cache delta for v4 trees
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (7 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 08/11] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 10/11] pack-objects: exclude commits out of delta objects in v4 Nguyễn Thái Ngọc Duy
                     ` (2 subsequent siblings)
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 69a22c1..665853d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1759,8 +1759,12 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * and therefore it is best to go to the write phase ASAP
 		 * instead, as we can afford spending more time compressing
 		 * between writes at that moment.
+		 *
+		 * For v4 trees we'll need to delta differently anyway
+		 * so no cache. v4 commits simply do not delta.
 		 */
-		if (entry->delta_data && !pack_to_stdout) {
+		if (entry->delta_data && !pack_to_stdout &&
+		    (pack_version < 4 || entry->type == OBJ_BLOB)) {
 			entry->z_delta_size = do_compress(&entry->delta_data,
 							  entry->delta_size);
 			cache_lock();
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 10/11] pack-objects: exclude commits out of delta objects in v4
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (8 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 09/11] pack-objects: do not cache delta for v4 trees Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-08 15:04   ` [PATCH 11/11] pack-objects: support writing pack v4 Nguyễn Thái Ngọc Duy
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 665853d..daa4349 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1332,7 +1332,8 @@ static void check_object(struct object_entry *entry)
 			break;
 		}
 
-		if (base_ref && (base_entry = locate_object_entry(base_ref))) {
+		if (base_ref && (base_entry = locate_object_entry(base_ref)) &&
+		    (pack_version < 4 || entry->type != OBJ_COMMIT)) {
 			/*
 			 * If base_ref was set above that means we wish to
 			 * reuse delta data, and we even found that base
@@ -1416,6 +1417,8 @@ static void get_object_details(void)
 		check_object(entry);
 		if (big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
+		if (pack_version == 4 && entry->type == OBJ_COMMIT)
+			entry->no_try_delta = 1;
 	}
 
 	free(sorted_by_offset);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH 11/11] pack-objects: support writing pack v4
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (9 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 10/11] pack-objects: exclude commits out of delta objects in v4 Nguyễn Thái Ngọc Duy
@ 2013-09-08 15:04   ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  11 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-08 15:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 85 ++++++++++++++++++++++++++++++++++++++++++++------
 pack.h                 |  2 +-
 2 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index daa4349..f6586a1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -254,8 +254,10 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	enum object_type type;
 	void *buf;
 	struct git_istream *st = NULL;
+	char *result = "OK";
 
-	if (!usable_delta) {
+	if (!usable_delta ||
+	    (pack_version == 4 || entry->type == OBJ_TREE)) {
 		if (entry->type == OBJ_BLOB &&
 		    entry->size > big_file_threshold &&
 		    (st = open_istream(entry->idx.sha1, &type, &size, NULL)) != NULL)
@@ -287,7 +289,37 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 
 	if (st)	/* large blob case, just assume we don't compress well */
 		datalen = size;
-	else if (entry->z_delta_size)
+	else if (pack_version == 4 && entry->type == OBJ_COMMIT) {
+		datalen = size;
+		result = pv4_encode_commit(&v4, buf, &datalen);
+		if (result) {
+			free(buf);
+			buf = result;
+			type = OBJ_PV4_COMMIT;
+		}
+	} else if (pack_version == 4 && entry->type == OBJ_TREE) {
+		datalen = size;
+		if (usable_delta) {
+			unsigned long base_size;
+			char *base_buf;
+			base_buf = read_sha1_file(entry->delta->idx.sha1, &type,
+						  &base_size);
+			if (!base_buf || type != OBJ_TREE)
+				die("unable to read %s",
+				    sha1_to_hex(entry->delta->idx.sha1));
+			result = pv4_encode_tree(&v4, buf, &datalen,
+						 base_buf, base_size,
+						 entry->delta->idx.sha1);
+			free(base_buf);
+		} else
+			result = pv4_encode_tree(&v4, buf, &datalen,
+						 NULL, 0, NULL);
+		if (result) {
+			free(buf);
+			buf = result;
+			type = OBJ_PV4_TREE;
+		}
+	} else if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
 	else
 		datalen = do_compress(&buf, size);
@@ -296,7 +328,10 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	 * The object header is a byte of 'type' followed by zero or
 	 * more bytes of length.
 	 */
-	hdrlen = encode_in_pack_object_header(type, size, header);
+	if (pack_version < 4)
+		hdrlen = encode_in_pack_object_header(type, size, header);
+	else
+		hdrlen = pv4_encode_in_pack_object_header(type, size, header);
 
 	if (type == OBJ_OFS_DELTA) {
 		/*
@@ -318,7 +353,7 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		sha1write(f, header, hdrlen);
 		sha1write(f, dheader + pos, sizeof(dheader) - pos);
 		hdrlen += sizeof(dheader) - pos;
-	} else if (type == OBJ_REF_DELTA) {
+	} else if (type == OBJ_REF_DELTA && pack_version < 4) {
 		/*
 		 * Deltas with a base reference contain
 		 * an additional 20 bytes for the base sha1.
@@ -332,6 +367,10 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		sha1write(f, header, hdrlen);
 		sha1write(f, entry->delta->idx.sha1, 20);
 		hdrlen += 20;
+	} else if (type == OBJ_REF_DELTA && pack_version == 4) {
+		hdrlen += encode_sha1ref(&v4, entry->delta->idx.sha1,
+					header + hdrlen);
+		sha1write(f, header, hdrlen);
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
 			if (st)
@@ -341,14 +380,26 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		}
 		sha1write(f, header, hdrlen);
 	}
+
 	if (st) {
 		datalen = write_large_blob_data(st, f, entry->idx.sha1);
 		close_istream(st);
-	} else {
-		sha1write(f, buf, datalen);
-		free(buf);
+		return hdrlen + datalen;
 	}
 
+	if (!result) {
+		warning(_("can't convert %s object %s"),
+			typename(entry->type),
+			sha1_to_hex(entry->idx.sha1));
+		free(buf);
+		buf = read_sha1_file(entry->idx.sha1, &type, &size);
+		if (!buf)
+			die(_("unable to read %s"),
+			    sha1_to_hex(entry->idx.sha1));
+		datalen = do_compress(&buf, size);
+	}
+	sha1write(f, buf, datalen);
+	free(buf);
 	return hdrlen + datalen;
 }
 
@@ -368,7 +419,10 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 	if (entry->delta)
 		type = (allow_ofs_delta && entry->delta->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
-	hdrlen = encode_in_pack_object_header(type, entry->size, header);
+	if (pack_version < 4)
+		hdrlen = encode_in_pack_object_header(type, entry->size, header);
+	else
+		hdrlen = pv4_encode_in_pack_object_header(type, entry->size, header);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -404,7 +458,7 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 		sha1write(f, dheader + pos, sizeof(dheader) - pos);
 		hdrlen += sizeof(dheader) - pos;
 		reused_delta++;
-	} else if (type == OBJ_REF_DELTA) {
+	} else if (type == OBJ_REF_DELTA && pack_version < 4) {
 		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
 			unuse_pack(&w_curs);
 			return 0;
@@ -413,6 +467,11 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 		sha1write(f, entry->delta->idx.sha1, 20);
 		hdrlen += 20;
 		reused_delta++;
+	} else if (type == OBJ_REF_DELTA && pack_version == 4) {
+		hdrlen += encode_sha1ref(&v4, entry->delta->idx.sha1,
+					header + hdrlen);
+		sha1write(f, header, hdrlen);
+		reused_delta++;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
 			unuse_pack(&w_curs);
@@ -477,7 +536,9 @@ static unsigned long write_object(struct sha1file *f,
 				 * and we do not need to deltify it.
 				 */
 
-	if (!to_reuse)
+	if (!to_reuse ||
+	    (pack_version == 4 &&
+	     (entry->type == OBJ_TREE || entry->type == OBJ_COMMIT)))
 		len = write_no_reuse_object(f, entry, limit, usable_delta);
 	else
 		len = write_reuse_object(f, entry, limit, usable_delta);
@@ -742,6 +803,8 @@ static void write_pack_file(void)
 		offset = write_pack_header(f, pack_version, nr_remaining);
 		if (!offset)
 			die_errno("unable to write pack header");
+		if (pack_version == 4)
+			offset += packv4_write_tables(f, &v4);
 		nr_written = 0;
 		for (; i < nr_objects; i++) {
 			struct object_entry *e = write_order[i];
@@ -2083,6 +2146,8 @@ static void prepare_pack(int window, int depth)
 		sort_dict_entries_by_hits(v4.commit_ident_table);
 		sort_dict_entries_by_hits(v4.tree_path_table);
 		prepare_sha1_table();
+		pack_idx_opts.version = 3;
+		allow_ofs_delta = 0;
 	}
 
 	get_object_details();
diff --git a/pack.h b/pack.h
index 38f869d..fde60ec 100644
--- a/pack.h
+++ b/pack.h
@@ -8,7 +8,7 @@
  * Packed object header
  */
 #define PACK_SIGNATURE 0x5041434b	/* "PACK" */
-#define pack_version_ok(v) ((v) == htonl(2) || (v) == htonl(3))
+#define pack_version_ok(v) ((v) == htonl(2) || (v) == htonl(3) || (v) == htonl(4))
 struct pack_header {
 	uint32_t hdr_signature;
 	uint32_t hdr_version;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* Re: [PATCH 05/11] pack-write.c: add pv4_encode_in_pack_object_header
  2013-09-08 15:04   ` [PATCH 05/11] pack-write.c: add pv4_encode_in_pack_object_header Nguyễn Thái Ngọc Duy
@ 2013-09-08 20:51     ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-08 20:51 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3238 bytes --]

On Sun, 8 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  pack-write.c | 29 +++++++++++++++++++++++++++++
>  pack.h       |  1 +
>  2 files changed, 30 insertions(+)
> 
> diff --git a/pack-write.c b/pack-write.c
> index 88e4788..6f11104 100644
> --- a/pack-write.c
> +++ b/pack-write.c
> @@ -1,6 +1,7 @@
>  #include "cache.h"
>  #include "pack.h"
>  #include "csum-file.h"
> +#include "varint.h"
>  
>  void reset_pack_idx_option(struct pack_idx_option *opts)
>  {
> @@ -340,6 +341,34 @@ int encode_in_pack_object_header(enum object_type type, uintmax_t size, unsigned
>  	return n;
>  }
>  
> +/*
> + * The per-object header is a pretty dense thing, which is
> + *  - first byte: low four bits are "size", then three bits of "type",
> + *    and the high bit is "size continues".
> + *  - each byte afterwards: low seven bits are size continuation,
> + *    with the high bit being "size continues"
> + */

This comment is a bit misleading.  It looks almost like the pack v2 
object header encoding which is not a varint encoded value like this one 
is.

> +int pv4_encode_in_pack_object_header(enum object_type type,
> +				     uintmax_t size, unsigned char *hdr)

Could we have a somewhat shorter function name? 
pv4_encode_object_header() should be acceptable given "pv4" already 
implies a pack.

> +{
> +	uintmax_t val;
> +	if (type < OBJ_COMMIT || type > OBJ_PV4_TREE || type == OBJ_OFS_DELTA)
> +		die("bad type %d", type);

This test has holes, such as types 5 and 8.

I think this would be better as:

	switch (type) {
	case OBJ_COMMIT:
	case OBJ_TREE:
	case OBJ_BLOB:
	case OBJ_TAG:
	case OBJ_REF_DELTA:
	case OBJ_PV4_COMMIT:
	case OBJ_PV4_TREE:
		break;
	default:
		die("bad type %d", type);
	}

The compiler ought to be smart enough to optimize the contiguous case 
range.  And that makes it explicit and obvious what we test for.

> +
> +	/*
> +	 * We allocate 4 bits in the LSB for the object type which
> +	 * should be good for quite a while, given that we effectively
> +	 * encodes only 5 object types: commit, tree, blob, delta,
> +	 * tag.
> +	 */
> +	val = size;
> +	if (MSB(val, 4))
> +		die("fixme: the code doesn't currently cope with big sizes");
> +	val <<= 4;
> +	val |= type;
> +	return encode_varint(val, hdr);
> +}
> +
>  struct sha1file *create_tmp_packfile(char **pack_tmp_name)
>  {
>  	char tmpname[PATH_MAX];
> diff --git a/pack.h b/pack.h
> index 855f6c6..38f869d 100644
> --- a/pack.h
> +++ b/pack.h
> @@ -83,6 +83,7 @@ extern off_t write_pack_header(struct sha1file *f, int, uint32_t);
>  extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
>  extern char *index_pack_lockfile(int fd);
>  extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
> +extern int pv4_encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
>  
>  #define PH_ERROR_EOF		(-1)
>  #define PH_ERROR_PACK_SIGNATURE	(-2)
> -- 
> 1.8.2.83.gc99314b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 03/11] pack v4: move packv4-create.c to libgit.a
  2013-09-08 15:04   ` [PATCH 03/11] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
@ 2013-09-08 20:56     ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-08 20:56 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2795 bytes --]

On Sun, 8 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> git-packv4-create now becomes test-packv4. Code that will not be used
> by pack-objects.c is moved to test-packv4.c. It may be removed when
> the code transition to pack-objects completes.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Makefile            |   4 +-
>  packv4-create.c     | 491 +---------------------------------------------------
>  packv4-create.h     |  39 +++++
>  test-packv4.c (new) | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 525 insertions(+), 485 deletions(-)
>  create mode 100644 test-packv4.c
> 
> diff --git a/Makefile b/Makefile
> index 22fc276..af2e3e3 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -550,7 +550,6 @@ PROGRAM_OBJS += shell.o
>  PROGRAM_OBJS += show-index.o
>  PROGRAM_OBJS += upload-pack.o
>  PROGRAM_OBJS += remote-testsvn.o
> -PROGRAM_OBJS += packv4-create.o
>  
>  # Binary suffix, set to .exe for Windows builds
>  X =
> @@ -568,6 +567,7 @@ TEST_PROGRAMS_NEED_X += test-line-buffer
>  TEST_PROGRAMS_NEED_X += test-match-trees
>  TEST_PROGRAMS_NEED_X += test-mergesort
>  TEST_PROGRAMS_NEED_X += test-mktemp
> +TEST_PROGRAMS_NEED_X += test-packv4
>  TEST_PROGRAMS_NEED_X += test-parse-options
>  TEST_PROGRAMS_NEED_X += test-path-utils
>  TEST_PROGRAMS_NEED_X += test-prio-queue
> @@ -702,6 +702,7 @@ LIB_H += notes.h
>  LIB_H += object.h
>  LIB_H += pack-revindex.h
>  LIB_H += pack.h
> +LIB_H += packv4-create.h
>  LIB_H += packv4-parse.h
>  LIB_H += parse-options.h
>  LIB_H += patch-ids.h
> @@ -839,6 +840,7 @@ LIB_OBJS += object.o
>  LIB_OBJS += pack-check.o
>  LIB_OBJS += pack-revindex.o
>  LIB_OBJS += pack-write.o
> +LIB_OBJS += packv4-create.o
>  LIB_OBJS += packv4-parse.o
>  LIB_OBJS += pager.o
>  LIB_OBJS += parse-options.o
> diff --git a/packv4-create.c b/packv4-create.c
> index 920a0b4..cdf82c0 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -18,9 +18,9 @@
>  #include "packv4-create.h"
>  
>  
> -static int pack_compression_seen;
> -static int pack_compression_level = Z_DEFAULT_COMPRESSION;
> -static int min_tree_copy = 1;
> +int pack_compression_seen;
> +int pack_compression_level = Z_DEFAULT_COMPRESSION;
> +int min_tree_copy = 1;
>  
>  struct data_entry {
>  	unsigned offset;
> @@ -28,17 +28,6 @@ struct data_entry {
>  	unsigned hits;
>  };
>  
> -struct dict_table {
> -	unsigned char *data;
> -	unsigned cur_offset;
> -	unsigned size;
> -	struct data_entry *entry;
> -	unsigned nb_entries;
> -	unsigned max_entries;
> -	unsigned *hash;
> -	unsigned hash_size;
> -};

It doesn't seem necessary to move this structure definition to the 
header file.  Only an opaque

	struct dict_table;

should be needed in packv4-create.h.  That would keep the dictionary 
handling localized.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 08/11] pack-objects: create pack v4 tables
  2013-09-08 15:04   ` [PATCH 08/11] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
@ 2013-09-09 10:40     ` Duy Nguyen
  2013-09-09 13:07       ` Nicolas Pitre
  0 siblings, 1 reply; 124+ messages in thread
From: Duy Nguyen @ 2013-09-09 10:40 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Nguyễn Thái Ngọc Duy

On Sun, Sep 8, 2013 at 10:04 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
> +static void prepare_sha1_table(void)
> +{
> +       unsigned i;
> +       /*
> +        * This table includes SHA-1s that may not be present in the
> +        * pack. One of the use of such SHA-1 is for completing thin
> +        * packs, where index-pack does not need to add SHA-1 to the
> +        * table at completion time.
> +        */
> +       v4.all_objs = xmalloc(nr_objects * sizeof(*v4.all_objs));
> +       v4.all_objs_nr = nr_objects;
> +       for (i = 0; i < nr_objects; i++)
> +               v4.all_objs[i] = objects[i].idx;
> +       qsort(v4.all_objs, nr_objects, sizeof(*v4.all_objs),
> +             sha1_idx_sort);
> +}
> +

fwiw this is wrong. Even in the non-thin pack case, pack-objects could
write multiple packs to disk and we need a different sha-1 table for
each one. The situation is worse for thin packs because not all
preferred_base entries end up as a real dependency in the final pack.
I'm working on it...
-- 
Duy

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 08/11] pack-objects: create pack v4 tables
  2013-09-09 10:40     ` Duy Nguyen
@ 2013-09-09 13:07       ` Nicolas Pitre
  2013-09-09 15:21         ` Junio C Hamano
  0 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-09 13:07 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1549 bytes --]

On Mon, 9 Sep 2013, Duy Nguyen wrote:

> On Sun, Sep 8, 2013 at 10:04 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
> > +static void prepare_sha1_table(void)
> > +{
> > +       unsigned i;
> > +       /*
> > +        * This table includes SHA-1s that may not be present in the
> > +        * pack. One of the use of such SHA-1 is for completing thin
> > +        * packs, where index-pack does not need to add SHA-1 to the
> > +        * table at completion time.
> > +        */
> > +       v4.all_objs = xmalloc(nr_objects * sizeof(*v4.all_objs));
> > +       v4.all_objs_nr = nr_objects;
> > +       for (i = 0; i < nr_objects; i++)
> > +               v4.all_objs[i] = objects[i].idx;
> > +       qsort(v4.all_objs, nr_objects, sizeof(*v4.all_objs),
> > +             sha1_idx_sort);
> > +}
> > +
> 
> fwiw this is wrong. Even in the non-thin pack case, pack-objects could
> write multiple packs to disk and we need a different sha-1 table for
> each one. The situation is worse for thin packs because not all
> preferred_base entries end up as a real dependency in the final pack.
> I'm working on it...

Is anyone still using --max-pack-size ?

I'm wondering if producing multiple packs from pack-objects is really 
useful these days.  If I remember correctly, this was created to allow 
the archiving of large packs onto CDROMs or the like.

I'd be tempted to simply ignore this facility and get rid of its
complexity if no one uses it.  Or assume that split packs will have
interdependencies.  Or they will be pack v2 only.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH v2 00/16] pack v4 support in pack-objects
  2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                     ` (10 preceding siblings ...)
  2013-09-08 15:04   ` [PATCH 11/11] pack-objects: support writing pack v4 Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57   ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 01/16] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
                       ` (15 more replies)
  11 siblings, 16 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

This version supports thin packs. I could clone from git.git with only
master, then fetch the rest, and fsck did not complain about anything.
I did not check whether I broke --max-pack-size, though.

The interesting patches are the ones near the end: "prepare SHA-1
table", "support writing pack v4" and "support completing thin packs".

There are still rough edges. If I don't find any new problems, I'll try
to run the test suite. My vacation days are over, so I will work at a
much slower pace than over the last couple of days.

Nguyễn Thái Ngọc Duy (16):
  pack v4: allocate dicts from the beginning
  pack v4: stop using static/global variables in packv4-create.c
  pack v4: move packv4-create.c to libgit.a
  pack v4: add version argument to write_pack_header
  pack_write: tighten valid object type check in
    encode_in_pack_object_header
  pack-write.c: add pv4_encode_object_header
  pack-objects: add --version to specify written pack version
  list-objects.c: add show_tree_entry callback to traverse_commit_list
  pack-objects: do not cache delta for v4 trees
  pack-objects: exclude commits out of delta objects in v4
  pack-objects: create pack v4 tables
  pack-objects: prepare SHA-1 table in v4
  pack-objects: support writing pack v4
  pack v4: support "end-of-pack" indicator in index-pack and
    pack-objects
  index-pack: use nr_objects_final as sha1_table size
  index-pack: support completing thin packs v4

 Makefile               |   4 +-
 builtin/index-pack.c   |  95 ++++++---
 builtin/pack-objects.c | 230 ++++++++++++++++++++--
 builtin/rev-list.c     |   4 +-
 bulk-checkin.c         |   2 +-
 list-objects.c         |   9 +-
 list-objects.h         |   3 +-
 pack-write.c           |  51 ++++-
 pack.h                 |   6 +-
 packv4-create.c        | 523 ++++---------------------------------------------
 packv4-create.h (new)  |  39 ++++
 test-packv4.c (new)    | 476 ++++++++++++++++++++++++++++++++++++++++++++
 upload-pack.c          |   2 +-
 13 files changed, 901 insertions(+), 543 deletions(-)
 create mode 100644 packv4-create.h
 create mode 100644 test-packv4.c

-- 
1.8.2.83.gc99314b

^ permalink raw reply	[flat|nested] 124+ messages in thread

* [PATCH v2 01/16] pack v4: allocate dicts from the beginning
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 02/16] pack v4: stop using static/global variables in packv4-create.c Nguyễn Thái Ngọc Duy
                       ` (14 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

commit_ident_table and tree_path_table are local to packv4-create.c
and test-packv4.c. Move them out of add_*_dict_entries so
add_*_dict_entries can be exported to pack-objects.c

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 packv4-create.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 38fa594..dbc2a03 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -181,14 +181,12 @@ static char *get_nameend_and_tz(char *from, int *tz_val)
 	return end;
 }
 
-static int add_commit_dict_entries(void *buf, unsigned long size)
+int add_commit_dict_entries(struct dict_table *commit_ident_table,
+			    void *buf, unsigned long size)
 {
 	char *name, *end = NULL;
 	int tz_val;
 
-	if (!commit_ident_table)
-		commit_ident_table = create_dict_table();
-
 	/* parse and add author info */
 	name = strstr(buf, "\nauthor ");
 	if (name) {
@@ -212,14 +210,12 @@ static int add_commit_dict_entries(void *buf, unsigned long size)
 	return 0;
 }
 
-static int add_tree_dict_entries(void *buf, unsigned long size)
+static int add_tree_dict_entries(struct dict_table *tree_path_table,
+				 void *buf, unsigned long size)
 {
 	struct tree_desc desc;
 	struct name_entry name_entry;
 
-	if (!tree_path_table)
-		tree_path_table = create_dict_table();
-
 	init_tree_desc(&desc, buf, size);
 	while (tree_entry(&desc, &name_entry)) {
 		int pathlen = tree_entry_len(&name_entry);
@@ -659,6 +655,9 @@ static int create_pack_dictionaries(struct packed_git *p,
 	struct progress *progress_state;
 	unsigned int i;
 
+	commit_ident_table = create_dict_table();
+	tree_path_table = create_dict_table();
+
 	progress_state = start_progress("Scanning objects", p->num_objects);
 	for (i = 0; i < p->num_objects; i++) {
 		struct pack_idx_entry *obj = obj_list[i];
@@ -666,7 +665,8 @@ static int create_pack_dictionaries(struct packed_git *p,
 		enum object_type type;
 		unsigned long size;
 		struct object_info oi = {};
-		int (*add_dict_entries)(void *, unsigned long);
+		int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
+		struct dict_table *dict;
 
 		display_progress(progress_state, i+1);
 
@@ -679,9 +679,11 @@ static int create_pack_dictionaries(struct packed_git *p,
 		switch (type) {
 		case OBJ_COMMIT:
 			add_dict_entries = add_commit_dict_entries;
+			dict = commit_ident_table;
 			break;
 		case OBJ_TREE:
 			add_dict_entries = add_tree_dict_entries;
+			dict = tree_path_table;
 			break;
 		default:
 			continue;
@@ -693,7 +695,7 @@ static int create_pack_dictionaries(struct packed_git *p,
 		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
 			die("packed %s from %s is corrupt",
 			    sha1_to_hex(obj->sha1), p->pack_name);
-		if (add_dict_entries(data, size) < 0)
+		if (add_dict_entries(dict, data, size) < 0)
 			die("can't process %s object %s",
 				typename(type), sha1_to_hex(obj->sha1));
 		free(data);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 02/16] pack v4: stop using static/global variables in packv4-create.c
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 01/16] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 03/16] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
                       ` (13 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 packv4-create.c       | 103 ++++++++++++++++++++++++++++----------------------
 packv4-create.h (new) |  11 ++++++
 2 files changed, 69 insertions(+), 45 deletions(-)
 create mode 100644 packv4-create.h

diff --git a/packv4-create.c b/packv4-create.c
index dbc2a03..920a0b4 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -15,6 +15,7 @@
 #include "pack-revindex.h"
 #include "progress.h"
 #include "varint.h"
+#include "packv4-create.h"
 
 
 static int pack_compression_seen;
@@ -145,9 +146,6 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
 	rehash_entries(t);
 }
 
-static struct dict_table *commit_ident_table;
-static struct dict_table *tree_path_table;
-
 /*
  * Parse the author/committer line from a canonical commit object.
  * The 'from' argument points right after the "author " or "committer "
@@ -243,10 +241,10 @@ void dump_dict_table(struct dict_table *t)
 	}
 }
 
-static void dict_dump(void)
+static void dict_dump(struct packv4_tables *v4)
 {
-	dump_dict_table(commit_ident_table);
-	dump_dict_table(tree_path_table);
+	dump_dict_table(v4->commit_ident_table);
+	dump_dict_table(v4->tree_path_table);
 }
 
 /*
@@ -254,10 +252,12 @@ static void dict_dump(void)
  * pack SHA1 table incremented by 1, or the literal SHA1 value prefixed
  * with a zero byte if the needed SHA1 is not available in the table.
  */
-static struct pack_idx_entry *all_objs;
-static unsigned all_objs_nr;
-static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
+
+int encode_sha1ref(const struct packv4_tables *v4,
+		   const unsigned char *sha1, unsigned char *buf)
 {
+	unsigned all_objs_nr = v4->all_objs_nr;
+	struct pack_idx_entry *all_objs = v4->all_objs;
 	unsigned lo = 0, hi = all_objs_nr;
 
 	do {
@@ -284,7 +284,8 @@ static int encode_sha1ref(const unsigned char *sha1, unsigned char *buf)
  * strict so to ensure the canonical version may always be
  * regenerated and produce the same hash.
  */
-void *pv4_encode_commit(void *buffer, unsigned long *sizep)
+void *pv4_encode_commit(const struct packv4_tables *v4,
+			void *buffer, unsigned long *sizep)
 {
 	unsigned long size = *sizep;
 	char *in, *tail, *end;
@@ -310,7 +311,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	if (get_sha1_lowhex(in + 5, sha1) < 0)
 		goto bad_data;
 	in += 46;
-	out += encode_sha1ref(sha1, out);
+	out += encode_sha1ref(v4, sha1, out);
 
 	/* count how many "parent" lines */
 	nb_parents = 0;
@@ -325,7 +326,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	while (nb_parents--) {
 		if (get_sha1_lowhex(in + 7, sha1))
 			goto bad_data;
-		out += encode_sha1ref(sha1, out);
+		out += encode_sha1ref(v4, sha1, out);
 		in += 48;
 	}
 
@@ -337,7 +338,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	end = get_nameend_and_tz(in, &tz_val);
 	if (!end)
 		goto bad_data;
-	author_index = dict_add_entry(commit_ident_table, tz_val, in, end - in);
+	author_index = dict_add_entry(v4->commit_ident_table, tz_val, in, end - in);
 	if (author_index < 0)
 		goto bad_dict;
 	author_time = strtoul(end, &end, 10);
@@ -353,7 +354,7 @@ void *pv4_encode_commit(void *buffer, unsigned long *sizep)
 	end = get_nameend_and_tz(in, &tz_val);
 	if (!end)
 		goto bad_data;
-	commit_index = dict_add_entry(commit_ident_table, tz_val, in, end - in);
+	commit_index = dict_add_entry(v4->commit_ident_table, tz_val, in, end - in);
 	if (commit_index < 0)
 		goto bad_dict;
 	commit_time = strtoul(end, &end, 10);
@@ -436,7 +437,8 @@ static int compare_tree_entries(struct name_entry *e1, struct name_entry *e2)
  * If a delta buffer is provided, we may encode multiple ranges of tree
  * entries against that buffer.
  */
-void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
+void *pv4_encode_tree(const struct packv4_tables *v4,
+		      void *_buffer, unsigned long *sizep,
 		      void *delta, unsigned long delta_size,
 		      const unsigned char *delta_sha1)
 {
@@ -551,7 +553,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			cp += encode_varint(copy_start, cp);
 			cp += encode_varint(copy_count, cp);
 			if (first_delta)
-				cp += encode_sha1ref(delta_sha1, cp);
+				cp += encode_sha1ref(v4, delta_sha1, cp);
 
 			/*
 			 * Now let's make sure this is going to take less
@@ -577,7 +579,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 		}
 
 		pathlen = tree_entry_len(&name_entry);
-		index = dict_add_entry(tree_path_table, name_entry.mode,
+		index = dict_add_entry(v4->tree_path_table, name_entry.mode,
 				       name_entry.path, pathlen);
 		if (index < 0) {
 			error("missing tree dict entry");
@@ -585,7 +587,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 			return NULL;
 		}
 		out += encode_varint(index << 1, out);
-		out += encode_sha1ref(name_entry.sha1, out);
+		out += encode_sha1ref(v4, name_entry.sha1, out);
 	}
 
 	if (copy_count) {
@@ -596,7 +598,7 @@ void *pv4_encode_tree(void *_buffer, unsigned long *sizep,
 		cp += encode_varint(copy_start, cp);
 		cp += encode_varint(copy_count, cp);
 		if (first_delta)
-			cp += encode_sha1ref(delta_sha1, cp);
+			cp += encode_sha1ref(v4, delta_sha1, cp);
 		if (copy_count >= min_tree_copy &&
 		    cp - copy_buf < out - &buffer[copy_pos]) {
 			out = buffer + copy_pos;
@@ -649,14 +651,15 @@ static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
 	return sorted;
 }
 
-static int create_pack_dictionaries(struct packed_git *p,
+static int create_pack_dictionaries(struct packv4_tables *v4,
+				    struct packed_git *p,
 				    struct pack_idx_entry **obj_list)
 {
 	struct progress *progress_state;
 	unsigned int i;
 
-	commit_ident_table = create_dict_table();
-	tree_path_table = create_dict_table();
+	v4->commit_ident_table = create_dict_table();
+	v4->tree_path_table = create_dict_table();
 
 	progress_state = start_progress("Scanning objects", p->num_objects);
 	for (i = 0; i < p->num_objects; i++) {
@@ -679,11 +682,11 @@ static int create_pack_dictionaries(struct packed_git *p,
 		switch (type) {
 		case OBJ_COMMIT:
 			add_dict_entries = add_commit_dict_entries;
-			dict = commit_ident_table;
+			dict = v4->commit_ident_table;
 			break;
 		case OBJ_TREE:
 			add_dict_entries = add_tree_dict_entries;
-			dict = tree_path_table;
+			dict = v4->tree_path_table;
 			break;
 		default:
 			continue;
@@ -776,9 +779,13 @@ static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
 	return sizeof(hdr);
 }
 
-static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects,
-					 struct pack_idx_entry *objs)
+unsigned long packv4_write_tables(struct sha1file *f,
+				  const struct packv4_tables *v4)
 {
+	unsigned nr_objects = v4->all_objs_nr;
+	struct pack_idx_entry *objs = v4->all_objs;
+	struct dict_table *commit_ident_table = v4->commit_ident_table;
+	struct dict_table *tree_path_table = v4->tree_path_table;
 	unsigned i;
 	unsigned long written = 0;
 
@@ -823,7 +830,8 @@ static int write_object_header(struct sha1file *f, enum object_type type, unsign
 	return len;
 }
 
-static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
+static unsigned long copy_object_data(struct packv4_tables *v4,
+				      struct sha1file *f, struct packed_git *p,
 				      off_t offset)
 {
 	struct pack_window *w_curs = NULL;
@@ -850,11 +858,13 @@ static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
 		if (base_offset <= 0 || base_offset >= offset)
 			die("delta offset out of bound");
 		revidx = find_pack_revindex(p, base_offset);
-		reflen = encode_sha1ref(nth_packed_object_sha1(p, revidx->nr), buf);
+		reflen = encode_sha1ref(v4,
+					nth_packed_object_sha1(p, revidx->nr),
+					buf);
 		sha1write(f, buf, reflen);
 		written += reflen;
 	} else if (type == OBJ_REF_DELTA) {
-		reflen = encode_sha1ref(src + hdrlen, buf);
+		reflen = encode_sha1ref(v4, src + hdrlen, buf);
 		hdrlen += 20;
 		sha1write(f, buf, reflen);
 		written += reflen;
@@ -919,7 +929,8 @@ static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
 	return sha1_buf;
 }
 
-static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
+static off_t packv4_write_object(struct packv4_tables *v4,
+				 struct sha1file *f, struct packed_git *p,
 				 struct pack_idx_entry *obj)
 {
 	void *src, *result;
@@ -941,7 +952,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 	case OBJ_TREE:
 		break;
 	default:
-		return copy_object_data(f, p, obj->offset);
+		return copy_object_data(v4, f, p, obj->offset);
 	}
 
 	/* The rest is converted into their new format */
@@ -955,7 +966,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 
 	switch (type) {
 	case OBJ_COMMIT:
-		result = pv4_encode_commit(src, &buf_size);
+		result = pv4_encode_commit(v4, src, &buf_size);
 		break;
 	case OBJ_TREE:
 		if (packed_type != OBJ_TREE) {
@@ -972,11 +983,12 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 			if (!ref || ref_type != OBJ_TREE)
 				die("cannot obtain delta base for %s",
 						sha1_to_hex(obj->sha1));
-			result = pv4_encode_tree(src, &buf_size,
+			result = pv4_encode_tree(v4, src, &buf_size,
 						 ref, ref_size, ref_sha1);
 			free(ref);
 		} else {
-			result = pv4_encode_tree(src, &buf_size, NULL, 0, NULL);
+			result = pv4_encode_tree(v4, src, &buf_size,
+						 NULL, 0, NULL);
 		}
 		break;
 	default:
@@ -987,7 +999,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		warning("can't convert %s object %s",
 			typename(type), sha1_to_hex(obj->sha1));
 		/* fall back to copy the object in its original form */
-		return copy_object_data(f, p, obj->offset);
+		return copy_object_data(v4, f, p, obj->offset);
 	}
 
 	/* Use bit 3 to indicate a special type encoding */
@@ -1041,7 +1053,7 @@ static struct packed_git *open_pack(const char *path)
 	return p;
 }
 
-static void process_one_pack(char *src_pack, char *dst_pack)
+static void process_one_pack(struct packv4_tables *v4, char *src_pack, char *dst_pack)
 {
 	struct packed_git *p;
 	struct sha1file *f;
@@ -1061,26 +1073,26 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	objs = get_packed_object_list(p);
 	p_objs = sort_objs_by_offset(objs, nr_objects);
 
-	create_pack_dictionaries(p, p_objs);
-	sort_dict_entries_by_hits(commit_ident_table);
-	sort_dict_entries_by_hits(tree_path_table);
+	create_pack_dictionaries(v4, p, p_objs);
+	sort_dict_entries_by_hits(v4->commit_ident_table);
+	sort_dict_entries_by_hits(v4->tree_path_table);
 
 	packname = normalize_pack_name(dst_pack);
 	f = packv4_open(packname);
 	if (!f)
 		die("unable to open destination pack");
 	written += packv4_write_header(f, nr_objects);
-	written += packv4_write_tables(f, nr_objects, objs);
+	written += packv4_write_tables(f, v4);
 
 	/* Let's write objects out, updating the object index list in place */
 	progress_state = start_progress("Writing objects", nr_objects);
-	all_objs = objs;
-	all_objs_nr = nr_objects;
+	v4->all_objs = objs;
+	v4->all_objs_nr = nr_objects;
 	for (i = 0; i < nr_objects; i++) {
 		off_t obj_pos = written;
 		struct pack_idx_entry *obj = p_objs[i];
 		crc32_begin(f);
-		written += packv4_write_object(f, p, obj);
+		written += packv4_write_object(v4, f, p, obj);
 		obj->offset = obj_pos;
 		obj->crc32 = crc32_end(f);
 		display_progress(progress_state, i+1);
@@ -1114,6 +1126,7 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 
 int main(int argc, char *argv[])
 {
+	struct packv4_tables v4;
 	char *src_pack, *dst_pack;
 
 	if (argc == 3) {
@@ -1131,8 +1144,8 @@ int main(int argc, char *argv[])
 	git_config(git_pack_config, NULL);
 	if (!pack_compression_seen && core_compression_seen)
 		pack_compression_level = core_compression_level;
-	process_one_pack(src_pack, dst_pack);
+	process_one_pack(&v4, src_pack, dst_pack);
 	if (0)
-		dict_dump();
+		dict_dump(&v4);
 	return 0;
 }
diff --git a/packv4-create.h b/packv4-create.h
new file mode 100644
index 0000000..0c8c77b
--- /dev/null
+++ b/packv4-create.h
@@ -0,0 +1,11 @@
+#ifndef PACKV4_CREATE_H
+#define PACKV4_CREATE_H
+
+struct packv4_tables {
+	struct pack_idx_entry *all_objs;
+	unsigned all_objs_nr;
+	struct dict_table *commit_ident_table;
+	struct dict_table *tree_path_table;
+};
+
+#endif
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 03/16] pack v4: move packv4-create.c to libgit.a
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 01/16] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 02/16] pack v4: stop using static/global variables in packv4-create.c Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 04/16] pack v4: add version argument to write_pack_header Nguyễn Thái Ngọc Duy
                       ` (12 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

git-packv4-create now becomes test-packv4. Code that will not be used
by pack-objects.c is moved to test-packv4.c, which may be removed once
the code transition to pack-objects completes.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Makefile            |   4 +-
 packv4-create.c     | 480 +---------------------------------------------------
 packv4-create.h     |  28 +++
 test-packv4.c (new) | 476 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 514 insertions(+), 474 deletions(-)
 create mode 100644 test-packv4.c

diff --git a/Makefile b/Makefile
index 22fc276..af2e3e3 100644
--- a/Makefile
+++ b/Makefile
@@ -550,7 +550,6 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
-PROGRAM_OBJS += packv4-create.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
@@ -568,6 +567,7 @@ TEST_PROGRAMS_NEED_X += test-line-buffer
 TEST_PROGRAMS_NEED_X += test-match-trees
 TEST_PROGRAMS_NEED_X += test-mergesort
 TEST_PROGRAMS_NEED_X += test-mktemp
+TEST_PROGRAMS_NEED_X += test-packv4
 TEST_PROGRAMS_NEED_X += test-parse-options
 TEST_PROGRAMS_NEED_X += test-path-utils
 TEST_PROGRAMS_NEED_X += test-prio-queue
@@ -702,6 +702,7 @@ LIB_H += notes.h
 LIB_H += object.h
 LIB_H += pack-revindex.h
 LIB_H += pack.h
+LIB_H += packv4-create.h
 LIB_H += packv4-parse.h
 LIB_H += parse-options.h
 LIB_H += patch-ids.h
@@ -839,6 +840,7 @@ LIB_OBJS += object.o
 LIB_OBJS += pack-check.o
 LIB_OBJS += pack-revindex.o
 LIB_OBJS += pack-write.o
+LIB_OBJS += packv4-create.o
 LIB_OBJS += packv4-parse.o
 LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
diff --git a/packv4-create.c b/packv4-create.c
index 920a0b4..83a6336 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -18,9 +18,9 @@
 #include "packv4-create.h"
 
 
-static int pack_compression_seen;
-static int pack_compression_level = Z_DEFAULT_COMPRESSION;
-static int min_tree_copy = 1;
+int pack_compression_seen;
+int pack_compression_level = Z_DEFAULT_COMPRESSION;
+int min_tree_copy = 1;
 
 struct data_entry {
 	unsigned offset;
@@ -139,7 +139,7 @@ static int cmp_dict_entries(const void *a_, const void *b_)
 	return diff;
 }
 
-static void sort_dict_entries_by_hits(struct dict_table *t)
+void sort_dict_entries_by_hits(struct dict_table *t)
 {
 	qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
 	t->hash_size = (t->nb_entries * 4 / 3) / 2;
@@ -208,7 +208,7 @@ int add_commit_dict_entries(struct dict_table *commit_ident_table,
 	return 0;
 }
 
-static int add_tree_dict_entries(struct dict_table *tree_path_table,
+int add_tree_dict_entries(struct dict_table *tree_path_table,
 				 void *buf, unsigned long size)
 {
 	struct tree_desc desc;
@@ -224,7 +224,7 @@ static int add_tree_dict_entries(struct dict_table *tree_path_table,
 	return 0;
 }
 
-void dump_dict_table(struct dict_table *t)
+static void dump_dict_table(struct dict_table *t)
 {
 	int i;
 
@@ -241,7 +241,7 @@ void dump_dict_table(struct dict_table *t)
 	}
 }
 
-static void dict_dump(struct packv4_tables *v4)
+void dict_dump(struct packv4_tables *v4)
 {
 	dump_dict_table(v4->commit_ident_table);
 	dump_dict_table(v4->tree_path_table);
@@ -611,103 +611,6 @@ void *pv4_encode_tree(const struct packv4_tables *v4,
 	return buffer;
 }
 
-static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
-{
-	unsigned i, nr_objects = p->num_objects;
-	struct pack_idx_entry *objects;
-
-	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
-	objects[nr_objects].offset = p->pack_size - 20;
-	for (i = 0; i < nr_objects; i++) {
-		hashcpy(objects[i].sha1, nth_packed_object_sha1(p, i));
-		objects[i].offset = nth_packed_object_offset(p, i);
-	}
-
-	return objects;
-}
-
-static int sort_by_offset(const void *e1, const void *e2)
-{
-	const struct pack_idx_entry * const *entry1 = e1;
-	const struct pack_idx_entry * const *entry2 = e2;
-	if ((*entry1)->offset < (*entry2)->offset)
-		return -1;
-	if ((*entry1)->offset > (*entry2)->offset)
-		return 1;
-	return 0;
-}
-
-static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
-						    unsigned nr_objects)
-{
-	unsigned i;
-	struct pack_idx_entry **sorted;
-
-	sorted = xmalloc((nr_objects + 1) * sizeof(*sorted));
-	for (i = 0; i < nr_objects + 1; i++)
-		sorted[i] = &list[i];
-	qsort(sorted, nr_objects + 1, sizeof(*sorted), sort_by_offset);
-
-	return sorted;
-}
-
-static int create_pack_dictionaries(struct packv4_tables *v4,
-				    struct packed_git *p,
-				    struct pack_idx_entry **obj_list)
-{
-	struct progress *progress_state;
-	unsigned int i;
-
-	v4->commit_ident_table = create_dict_table();
-	v4->tree_path_table = create_dict_table();
-
-	progress_state = start_progress("Scanning objects", p->num_objects);
-	for (i = 0; i < p->num_objects; i++) {
-		struct pack_idx_entry *obj = obj_list[i];
-		void *data;
-		enum object_type type;
-		unsigned long size;
-		struct object_info oi = {};
-		int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
-		struct dict_table *dict;
-
-		display_progress(progress_state, i+1);
-
-		oi.typep = &type;
-		oi.sizep = &size;
-		if (packed_object_info(p, obj->offset, &oi) < 0)
-			die("cannot get type of %s from %s",
-			    sha1_to_hex(obj->sha1), p->pack_name);
-
-		switch (type) {
-		case OBJ_COMMIT:
-			add_dict_entries = add_commit_dict_entries;
-			dict = v4->commit_ident_table;
-			break;
-		case OBJ_TREE:
-			add_dict_entries = add_tree_dict_entries;
-			dict = v4->tree_path_table;
-			break;
-		default:
-			continue;
-		}
-		data = unpack_entry(p, obj->offset, &type, &size);
-		if (!data)
-			die("cannot unpack %s from %s",
-			    sha1_to_hex(obj->sha1), p->pack_name);
-		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
-			die("packed %s from %s is corrupt",
-			    sha1_to_hex(obj->sha1), p->pack_name);
-		if (add_dict_entries(dict, data, size) < 0)
-			die("can't process %s object %s",
-				typename(type), sha1_to_hex(obj->sha1));
-		free(data);
-	}
-
-	stop_progress(&progress_state);
-	return 0;
-}
-
 static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
 {
 	unsigned char buffer[1024];
@@ -757,28 +660,6 @@ static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
 	return hdrlen + datalen;
 }
 
-static struct sha1file * packv4_open(char *path)
-{
-	int fd;
-
-	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
-	if (fd < 0)
-		die_errno("unable to create '%s'", path);
-	return sha1fd(fd, path);
-}
-
-static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
-{
-	struct pack_header hdr;
-
-	hdr.hdr_signature = htonl(PACK_SIGNATURE);
-	hdr.hdr_version = htonl(4);
-	hdr.hdr_entries = htonl(nr_objects);
-	sha1write(f, &hdr, sizeof(hdr));
-
-	return sizeof(hdr);
-}
-
 unsigned long packv4_write_tables(struct sha1file *f,
 				  const struct packv4_tables *v4)
 {
@@ -802,350 +683,3 @@ unsigned long packv4_write_tables(struct sha1file *f,
 
 	return written;
 }
-
-static int write_object_header(struct sha1file *f, enum object_type type, unsigned long size)
-{
-	unsigned char buf[16];
-	uint64_t val;
-	int len;
-
-	/*
-	 * We really have only one kind of delta object.
-	 */
-	if (type == OBJ_OFS_DELTA)
-		type = OBJ_REF_DELTA;
-
-	/*
-	 * We allocate 4 bits in the LSB for the object type which should
-	 * be good for quite a while, given that we effectively encodes
-	 * only 5 object types: commit, tree, blob, delta, tag.
-	 */
-	val = size;
-	if (MSB(val, 4))
-		die("fixme: the code doesn't currently cope with big sizes");
-	val <<= 4;
-	val |= type;
-	len = encode_varint(val, buf);
-	sha1write(f, buf, len);
-	return len;
-}
-
-static unsigned long copy_object_data(struct packv4_tables *v4,
-				      struct sha1file *f, struct packed_git *p,
-				      off_t offset)
-{
-	struct pack_window *w_curs = NULL;
-	struct revindex_entry *revidx;
-	enum object_type type;
-	unsigned long avail, size, datalen, written;
-	int hdrlen, reflen, idx_nr;
-	unsigned char *src, buf[24];
-
-	revidx = find_pack_revindex(p, offset);
-	idx_nr = revidx->nr;
-	datalen = revidx[1].offset - offset;
-
-	src = use_pack(p, &w_curs, offset, &avail);
-	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
-
-	written = write_object_header(f, type, size);
-
-	if (type == OBJ_OFS_DELTA) {
-		const unsigned char *cp = src + hdrlen;
-		off_t base_offset = decode_varint(&cp);
-		hdrlen = cp - src;
-		base_offset = offset - base_offset;
-		if (base_offset <= 0 || base_offset >= offset)
-			die("delta offset out of bound");
-		revidx = find_pack_revindex(p, base_offset);
-		reflen = encode_sha1ref(v4,
-					nth_packed_object_sha1(p, revidx->nr),
-					buf);
-		sha1write(f, buf, reflen);
-		written += reflen;
-	} else if (type == OBJ_REF_DELTA) {
-		reflen = encode_sha1ref(v4, src + hdrlen, buf);
-		hdrlen += 20;
-		sha1write(f, buf, reflen);
-		written += reflen;
-	}
-
-	if (p->index_version > 1 &&
-	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
-		die("bad CRC for object at offset %"PRIuMAX" in %s",
-		    (uintmax_t)offset, p->pack_name);
-
-	offset += hdrlen;
-	datalen -= hdrlen;
-
-	while (datalen) {
-		src = use_pack(p, &w_curs, offset, &avail);
-		if (avail > datalen)
-			avail = datalen;
-		sha1write(f, src, avail);
-		written += avail;
-		offset += avail;
-		datalen -= avail;
-	}
-	unuse_pack(&w_curs);
-
-	return written;
-}
-
-static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
-				     unsigned char *sha1_buf)
-{
-	struct pack_window *w_curs = NULL;
-	enum object_type type;
-	unsigned long avail, size;
-	int hdrlen;
-	unsigned char *src;
-	const unsigned char *base_sha1 = NULL; ;
-
-	src = use_pack(p, &w_curs, offset, &avail);
-	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
-
-	if (type == OBJ_OFS_DELTA) {
-		const unsigned char *cp = src + hdrlen;
-		off_t base_offset = decode_varint(&cp);
-		base_offset = offset - base_offset;
-		if (base_offset <= 0 || base_offset >= offset) {
-			error("delta offset out of bound");
-		} else {
-			struct revindex_entry *revidx;
-			revidx = find_pack_revindex(p, base_offset);
-			base_sha1 = nth_packed_object_sha1(p, revidx->nr);
-		}
-	} else if (type == OBJ_REF_DELTA) {
-		base_sha1 = src + hdrlen;
-	} else
-		error("expected to get a delta but got a %s", typename(type));
-
-	unuse_pack(&w_curs);
-
-	if (!base_sha1)
-		return NULL;
-	hashcpy(sha1_buf, base_sha1);
-	return sha1_buf;
-}
-
-static off_t packv4_write_object(struct packv4_tables *v4,
-				 struct sha1file *f, struct packed_git *p,
-				 struct pack_idx_entry *obj)
-{
-	void *src, *result;
-	struct object_info oi = {};
-	enum object_type type, packed_type;
-	unsigned long obj_size, buf_size;
-	unsigned int hdrlen;
-
-	oi.typep = &type;
-	oi.sizep = &obj_size;
-	packed_type = packed_object_info(p, obj->offset, &oi);
-	if (packed_type < 0)
-		die("cannot get type of %s from %s",
-		    sha1_to_hex(obj->sha1), p->pack_name);
-
-	/* Some objects are copied without decompression */
-	switch (type) {
-	case OBJ_COMMIT:
-	case OBJ_TREE:
-		break;
-	default:
-		return copy_object_data(v4, f, p, obj->offset);
-	}
-
-	/* The rest is converted into their new format */
-	src = unpack_entry(p, obj->offset, &type, &buf_size);
-	if (!src || obj_size != buf_size)
-		die("cannot unpack %s from %s",
-		    sha1_to_hex(obj->sha1), p->pack_name);
-	if (check_sha1_signature(obj->sha1, src, buf_size, typename(type)))
-		die("packed %s from %s is corrupt",
-		    sha1_to_hex(obj->sha1), p->pack_name);
-
-	switch (type) {
-	case OBJ_COMMIT:
-		result = pv4_encode_commit(v4, src, &buf_size);
-		break;
-	case OBJ_TREE:
-		if (packed_type != OBJ_TREE) {
-			unsigned char sha1_buf[20], *ref_sha1;
-			void *ref;
-			enum object_type ref_type;
-			unsigned long ref_size;
-
-			ref_sha1 = get_delta_base(p, obj->offset, sha1_buf);
-			if (!ref_sha1)
-				die("unable to get delta base sha1 for %s",
-						sha1_to_hex(obj->sha1));
-			ref = read_sha1_file(ref_sha1, &ref_type, &ref_size);
-			if (!ref || ref_type != OBJ_TREE)
-				die("cannot obtain delta base for %s",
-						sha1_to_hex(obj->sha1));
-			result = pv4_encode_tree(v4, src, &buf_size,
-						 ref, ref_size, ref_sha1);
-			free(ref);
-		} else {
-			result = pv4_encode_tree(v4, src, &buf_size,
-						 NULL, 0, NULL);
-		}
-		break;
-	default:
-		die("unexpected object type %d", type);
-	}
-	free(src);
-	if (!result) {
-		warning("can't convert %s object %s",
-			typename(type), sha1_to_hex(obj->sha1));
-		/* fall back to copy the object in its original form */
-		return copy_object_data(v4, f, p, obj->offset);
-	}
-
-	/* Use bit 3 to indicate a special type encoding */
-	type += 8;
-	hdrlen = write_object_header(f, type, obj_size);
-	sha1write(f, result, buf_size);
-	free(result);
-	return hdrlen + buf_size;
-}
-
-static char *normalize_pack_name(const char *path)
-{
-	char buf[PATH_MAX];
-	int len;
-
-	len = strlcpy(buf, path, PATH_MAX);
-	if (len >= PATH_MAX - 6)
-		die("name too long: %s", path);
-
-	/*
-	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
-	 * normalize these forms to "foo.pack".
-	 */
-	if (has_extension(buf, ".idx")) {
-		strcpy(buf + len - 4, ".pack");
-		len++;
-	} else if (!has_extension(buf, ".pack")) {
-		strcpy(buf + len, ".pack");
-		len += 5;
-	}
-
-	return xstrdup(buf);
-}
-
-static struct packed_git *open_pack(const char *path)
-{
-	char *packname = normalize_pack_name(path);
-	int len = strlen(packname);
-	struct packed_git *p;
-
-	strcpy(packname + len - 5, ".idx");
-	p = add_packed_git(packname, len - 1, 1);
-	if (!p)
-		die("packfile %s not found.", packname);
-
-	install_packed_git(p);
-	if (open_pack_index(p))
-		die("packfile %s index not opened", p->pack_name);
-
-	free(packname);
-	return p;
-}
-
-static void process_one_pack(struct packv4_tables *v4, char *src_pack, char *dst_pack)
-{
-	struct packed_git *p;
-	struct sha1file *f;
-	struct pack_idx_entry *objs, **p_objs;
-	struct pack_idx_option idx_opts;
-	unsigned i, nr_objects;
-	off_t written = 0;
-	char *packname;
-	unsigned char pack_sha1[20];
-	struct progress *progress_state;
-
-	p = open_pack(src_pack);
-	if (!p)
-		die("unable to open source pack");
-
-	nr_objects = p->num_objects;
-	objs = get_packed_object_list(p);
-	p_objs = sort_objs_by_offset(objs, nr_objects);
-
-	create_pack_dictionaries(v4, p, p_objs);
-	sort_dict_entries_by_hits(v4->commit_ident_table);
-	sort_dict_entries_by_hits(v4->tree_path_table);
-
-	packname = normalize_pack_name(dst_pack);
-	f = packv4_open(packname);
-	if (!f)
-		die("unable to open destination pack");
-	written += packv4_write_header(f, nr_objects);
-	written += packv4_write_tables(f, v4);
-
-	/* Let's write objects out, updating the object index list in place */
-	progress_state = start_progress("Writing objects", nr_objects);
-	v4->all_objs = objs;
-	v4->all_objs_nr = nr_objects;
-	for (i = 0; i < nr_objects; i++) {
-		off_t obj_pos = written;
-		struct pack_idx_entry *obj = p_objs[i];
-		crc32_begin(f);
-		written += packv4_write_object(v4, f, p, obj);
-		obj->offset = obj_pos;
-		obj->crc32 = crc32_end(f);
-		display_progress(progress_state, i+1);
-	}
-	stop_progress(&progress_state);
-
-	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
-
-	reset_pack_idx_option(&idx_opts);
-	idx_opts.version = 3;
-	strcpy(packname + strlen(packname) - 5, ".idx");
-	write_idx_file(packname, p_objs, nr_objects, &idx_opts, pack_sha1);
-
-	free(packname);
-}
-
-static int git_pack_config(const char *k, const char *v, void *cb)
-{
-	if (!strcmp(k, "pack.compression")) {
-		int level = git_config_int(k, v);
-		if (level == -1)
-			level = Z_DEFAULT_COMPRESSION;
-		else if (level < 0 || level > Z_BEST_COMPRESSION)
-			die("bad pack compression level %d", level);
-		pack_compression_level = level;
-		pack_compression_seen = 1;
-		return 0;
-	}
-	return git_default_config(k, v, cb);
-}
-
-int main(int argc, char *argv[])
-{
-	struct packv4_tables v4;
-	char *src_pack, *dst_pack;
-
-	if (argc == 3) {
-		src_pack = argv[1];
-		dst_pack = argv[2];
-	} else if (argc == 4 && !prefixcmp(argv[1], "--min-tree-copy=")) {
-		min_tree_copy = atoi(argv[1] + strlen("--min-tree-copy="));
-		src_pack = argv[2];
-		dst_pack = argv[3];
-	} else {
-		fprintf(stderr, "Usage: %s [--min-tree-copy=<n>] <src_packfile> <dst_packfile>\n", argv[0]);
-		exit(1);
-	}
-
-	git_config(git_pack_config, NULL);
-	if (!pack_compression_seen && core_compression_seen)
-		pack_compression_level = core_compression_level;
-	process_one_pack(&v4, src_pack, dst_pack);
-	if (0)
-		dict_dump(&v4);
-	return 0;
-}
diff --git a/packv4-create.h b/packv4-create.h
index 0c8c77b..ba4929a 100644
--- a/packv4-create.h
+++ b/packv4-create.h
@@ -8,4 +8,32 @@ struct packv4_tables {
 	struct dict_table *tree_path_table;
 };
 
+struct dict_table;
+struct sha1file;
+
+struct dict_table *create_dict_table(void);
+int dict_add_entry(struct dict_table *t, int val, const char *str, int str_len);
+void destroy_dict_table(struct dict_table *t);
+void dict_dump(struct packv4_tables *v4);
+
+int add_commit_dict_entries(struct dict_table *commit_ident_table,
+			    void *buf, unsigned long size);
+int add_tree_dict_entries(struct dict_table *tree_path_table,
+			  void *buf, unsigned long size);
+void sort_dict_entries_by_hits(struct dict_table *t);
+
+int encode_sha1ref(const struct packv4_tables *v4,
+		   const unsigned char *sha1, unsigned char *buf);
+unsigned long packv4_write_tables(struct sha1file *f,
+				  const struct packv4_tables *v4);
+void *pv4_encode_commit(const struct packv4_tables *v4,
+			void *buffer, unsigned long *sizep);
+void *pv4_encode_tree(const struct packv4_tables *v4,
+		      void *_buffer, unsigned long *sizep,
+		      void *delta, unsigned long delta_size,
+		      const unsigned char *delta_sha1);
+
+void process_one_pack(struct packv4_tables *v4,
+		      char *src_pack, char *dst_pack);
+
 #endif
diff --git a/test-packv4.c b/test-packv4.c
new file mode 100644
index 0000000..3b0d7a2
--- /dev/null
+++ b/test-packv4.c
@@ -0,0 +1,476 @@
+#include "cache.h"
+#include "pack.h"
+#include "pack-revindex.h"
+#include "progress.h"
+#include "varint.h"
+#include "packv4-create.h"
+
+extern int pack_compression_seen;
+extern int pack_compression_level;
+extern int min_tree_copy;
+
+static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
+{
+	unsigned i, nr_objects = p->num_objects;
+	struct pack_idx_entry *objects;
+
+	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+	objects[nr_objects].offset = p->pack_size - 20;
+	for (i = 0; i < nr_objects; i++) {
+		hashcpy(objects[i].sha1, nth_packed_object_sha1(p, i));
+		objects[i].offset = nth_packed_object_offset(p, i);
+	}
+
+	return objects;
+}
+
+static int sort_by_offset(const void *e1, const void *e2)
+{
+	const struct pack_idx_entry * const *entry1 = e1;
+	const struct pack_idx_entry * const *entry2 = e2;
+	if ((*entry1)->offset < (*entry2)->offset)
+		return -1;
+	if ((*entry1)->offset > (*entry2)->offset)
+		return 1;
+	return 0;
+}
+
+static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
+						    unsigned nr_objects)
+{
+	unsigned i;
+	struct pack_idx_entry **sorted;
+
+	sorted = xmalloc((nr_objects + 1) * sizeof(*sorted));
+	for (i = 0; i < nr_objects + 1; i++)
+		sorted[i] = &list[i];
+	qsort(sorted, nr_objects + 1, sizeof(*sorted), sort_by_offset);
+
+	return sorted;
+}
+
+static int create_pack_dictionaries(struct packv4_tables *v4,
+				    struct packed_git *p,
+				    struct pack_idx_entry **obj_list)
+{
+	struct progress *progress_state;
+	unsigned int i;
+
+	v4->commit_ident_table = create_dict_table();
+	v4->tree_path_table = create_dict_table();
+
+	progress_state = start_progress("Scanning objects", p->num_objects);
+	for (i = 0; i < p->num_objects; i++) {
+		struct pack_idx_entry *obj = obj_list[i];
+		void *data;
+		enum object_type type;
+		unsigned long size;
+		struct object_info oi = {};
+		int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
+		struct dict_table *dict;
+
+		display_progress(progress_state, i+1);
+
+		oi.typep = &type;
+		oi.sizep = &size;
+		if (packed_object_info(p, obj->offset, &oi) < 0)
+			die("cannot get type of %s from %s",
+			    sha1_to_hex(obj->sha1), p->pack_name);
+
+		switch (type) {
+		case OBJ_COMMIT:
+			add_dict_entries = add_commit_dict_entries;
+			dict = v4->commit_ident_table;
+			break;
+		case OBJ_TREE:
+			add_dict_entries = add_tree_dict_entries;
+			dict = v4->tree_path_table;
+			break;
+		default:
+			continue;
+		}
+		data = unpack_entry(p, obj->offset, &type, &size);
+		if (!data)
+			die("cannot unpack %s from %s",
+			    sha1_to_hex(obj->sha1), p->pack_name);
+		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
+			die("packed %s from %s is corrupt",
+			    sha1_to_hex(obj->sha1), p->pack_name);
+		if (add_dict_entries(dict, data, size) < 0)
+			die("can't process %s object %s",
+				typename(type), sha1_to_hex(obj->sha1));
+		free(data);
+	}
+
+	stop_progress(&progress_state);
+	return 0;
+}
+
+static struct sha1file * packv4_open(char *path)
+{
+	int fd;
+
+	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
+	if (fd < 0)
+		die_errno("unable to create '%s'", path);
+	return sha1fd(fd, path);
+}
+
+static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
+{
+	struct pack_header hdr;
+
+	hdr.hdr_signature = htonl(PACK_SIGNATURE);
+	hdr.hdr_version = htonl(4);
+	hdr.hdr_entries = htonl(nr_objects);
+	sha1write(f, &hdr, sizeof(hdr));
+
+	return sizeof(hdr);
+}
+
+static int write_object_header(struct sha1file *f, enum object_type type, unsigned long size)
+{
+	unsigned char buf[16];
+	uint64_t val;
+	int len;
+
+	/*
+	 * We really have only one kind of delta object.
+	 */
+	if (type == OBJ_OFS_DELTA)
+		type = OBJ_REF_DELTA;
+
+	/*
+	 * We allocate 4 bits in the LSB for the object type which should
+	 * be good for quite a while, given that we effectively encode
+	 * only 5 object types: commit, tree, blob, delta, tag.
+	 */
+	val = size;
+	if (MSB(val, 4))
+		die("fixme: the code doesn't currently cope with big sizes");
+	val <<= 4;
+	val |= type;
+	len = encode_varint(val, buf);
+	sha1write(f, buf, len);
+	return len;
+}
+
+static unsigned long copy_object_data(struct packv4_tables *v4,
+				      struct sha1file *f, struct packed_git *p,
+				      off_t offset)
+{
+	struct pack_window *w_curs = NULL;
+	struct revindex_entry *revidx;
+	enum object_type type;
+	unsigned long avail, size, datalen, written;
+	int hdrlen, reflen, idx_nr;
+	unsigned char *src, buf[24];
+
+	revidx = find_pack_revindex(p, offset);
+	idx_nr = revidx->nr;
+	datalen = revidx[1].offset - offset;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	written = write_object_header(f, type, size);
+
+	if (type == OBJ_OFS_DELTA) {
+		const unsigned char *cp = src + hdrlen;
+		off_t base_offset = decode_varint(&cp);
+		hdrlen = cp - src;
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset)
+			die("delta offset out of bound");
+		revidx = find_pack_revindex(p, base_offset);
+		reflen = encode_sha1ref(v4,
+					nth_packed_object_sha1(p, revidx->nr),
+					buf);
+		sha1write(f, buf, reflen);
+		written += reflen;
+	} else if (type == OBJ_REF_DELTA) {
+		reflen = encode_sha1ref(v4, src + hdrlen, buf);
+		hdrlen += 20;
+		sha1write(f, buf, reflen);
+		written += reflen;
+	}
+
+	if (p->index_version > 1 &&
+	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
+		die("bad CRC for object at offset %"PRIuMAX" in %s",
+		    (uintmax_t)offset, p->pack_name);
+
+	offset += hdrlen;
+	datalen -= hdrlen;
+
+	while (datalen) {
+		src = use_pack(p, &w_curs, offset, &avail);
+		if (avail > datalen)
+			avail = datalen;
+		sha1write(f, src, avail);
+		written += avail;
+		offset += avail;
+		datalen -= avail;
+	}
+	unuse_pack(&w_curs);
+
+	return written;
+}
+
+static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
+				     unsigned char *sha1_buf)
+{
+	struct pack_window *w_curs = NULL;
+	enum object_type type;
+	unsigned long avail, size;
+	int hdrlen;
+	unsigned char *src;
+	const unsigned char *base_sha1 = NULL; ;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	if (type == OBJ_OFS_DELTA) {
+		const unsigned char *cp = src + hdrlen;
+		off_t base_offset = decode_varint(&cp);
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset) {
+			error("delta offset out of bound");
+		} else {
+			struct revindex_entry *revidx;
+			revidx = find_pack_revindex(p, base_offset);
+			base_sha1 = nth_packed_object_sha1(p, revidx->nr);
+		}
+	} else if (type == OBJ_REF_DELTA) {
+		base_sha1 = src + hdrlen;
+	} else
+		error("expected to get a delta but got a %s", typename(type));
+
+	unuse_pack(&w_curs);
+
+	if (!base_sha1)
+		return NULL;
+	hashcpy(sha1_buf, base_sha1);
+	return sha1_buf;
+}
+
+static off_t packv4_write_object(struct packv4_tables *v4,
+				 struct sha1file *f, struct packed_git *p,
+				 struct pack_idx_entry *obj)
+{
+	void *src, *result;
+	struct object_info oi = {};
+	enum object_type type, packed_type;
+	unsigned long obj_size, buf_size;
+	unsigned int hdrlen;
+
+	oi.typep = &type;
+	oi.sizep = &obj_size;
+	packed_type = packed_object_info(p, obj->offset, &oi);
+	if (packed_type < 0)
+		die("cannot get type of %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	/* Some objects are copied without decompression */
+	switch (type) {
+	case OBJ_COMMIT:
+	case OBJ_TREE:
+		break;
+	default:
+		return copy_object_data(v4, f, p, obj->offset);
+	}
+
+	/* The rest is converted into their new format */
+	src = unpack_entry(p, obj->offset, &type, &buf_size);
+	if (!src || obj_size != buf_size)
+		die("cannot unpack %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+	if (check_sha1_signature(obj->sha1, src, buf_size, typename(type)))
+		die("packed %s from %s is corrupt",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	switch (type) {
+	case OBJ_COMMIT:
+		result = pv4_encode_commit(v4, src, &buf_size);
+		break;
+	case OBJ_TREE:
+		if (packed_type != OBJ_TREE) {
+			unsigned char sha1_buf[20], *ref_sha1;
+			void *ref;
+			enum object_type ref_type;
+			unsigned long ref_size;
+
+			ref_sha1 = get_delta_base(p, obj->offset, sha1_buf);
+			if (!ref_sha1)
+				die("unable to get delta base sha1 for %s",
+						sha1_to_hex(obj->sha1));
+			ref = read_sha1_file(ref_sha1, &ref_type, &ref_size);
+			if (!ref || ref_type != OBJ_TREE)
+				die("cannot obtain delta base for %s",
+						sha1_to_hex(obj->sha1));
+			result = pv4_encode_tree(v4, src, &buf_size,
+						 ref, ref_size, ref_sha1);
+			free(ref);
+		} else {
+			result = pv4_encode_tree(v4, src, &buf_size,
+						 NULL, 0, NULL);
+		}
+		break;
+	default:
+		die("unexpected object type %d", type);
+	}
+	free(src);
+	if (!result) {
+		warning("can't convert %s object %s",
+			typename(type), sha1_to_hex(obj->sha1));
+		/* fall back to copying the object in its original form */
+		return copy_object_data(v4, f, p, obj->offset);
+	}
+
+	/* Use bit 3 to indicate a special type encoding */
+	type += 8;
+	hdrlen = write_object_header(f, type, obj_size);
+	sha1write(f, result, buf_size);
+	free(result);
+	return hdrlen + buf_size;
+}
+
+static char *normalize_pack_name(const char *path)
+{
+	char buf[PATH_MAX];
+	int len;
+
+	len = strlcpy(buf, path, PATH_MAX);
+	if (len >= PATH_MAX - 6)
+		die("name too long: %s", path);
+
+	/*
+	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
+	 * normalize these forms to "foo.pack".
+	 */
+	if (has_extension(buf, ".idx")) {
+		strcpy(buf + len - 4, ".pack");
+		len++;
+	} else if (!has_extension(buf, ".pack")) {
+		strcpy(buf + len, ".pack");
+		len += 5;
+	}
+
+	return xstrdup(buf);
+}
+
+static struct packed_git *open_pack(const char *path)
+{
+	char *packname = normalize_pack_name(path);
+	int len = strlen(packname);
+	struct packed_git *p;
+
+	strcpy(packname + len - 5, ".idx");
+	p = add_packed_git(packname, len - 1, 1);
+	if (!p)
+		die("packfile %s not found.", packname);
+
+	install_packed_git(p);
+	if (open_pack_index(p))
+		die("packfile %s index not opened", p->pack_name);
+
+	free(packname);
+	return p;
+}
+
+void process_one_pack(struct packv4_tables *v4, char *src_pack, char *dst_pack)
+{
+	struct packed_git *p;
+	struct sha1file *f;
+	struct pack_idx_entry *objs, **p_objs;
+	struct pack_idx_option idx_opts;
+	unsigned i, nr_objects;
+	off_t written = 0;
+	char *packname;
+	unsigned char pack_sha1[20];
+	struct progress *progress_state;
+
+	p = open_pack(src_pack);
+	if (!p)
+		die("unable to open source pack");
+
+	nr_objects = p->num_objects;
+	objs = get_packed_object_list(p);
+	p_objs = sort_objs_by_offset(objs, nr_objects);
+
+	create_pack_dictionaries(v4, p, p_objs);
+	sort_dict_entries_by_hits(v4->commit_ident_table);
+	sort_dict_entries_by_hits(v4->tree_path_table);
+
+	packname = normalize_pack_name(dst_pack);
+	f = packv4_open(packname);
+	if (!f)
+		die("unable to open destination pack");
+	written += packv4_write_header(f, nr_objects);
+	written += packv4_write_tables(f, v4);
+
+	/* Let's write objects out, updating the object index list in place */
+	progress_state = start_progress("Writing objects", nr_objects);
+	v4->all_objs = objs;
+	v4->all_objs_nr = nr_objects;
+	for (i = 0; i < nr_objects; i++) {
+		off_t obj_pos = written;
+		struct pack_idx_entry *obj = p_objs[i];
+		crc32_begin(f);
+		written += packv4_write_object(v4, f, p, obj);
+		obj->offset = obj_pos;
+		obj->crc32 = crc32_end(f);
+		display_progress(progress_state, i+1);
+	}
+	stop_progress(&progress_state);
+
+	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
+
+	reset_pack_idx_option(&idx_opts);
+	idx_opts.version = 3;
+	strcpy(packname + strlen(packname) - 5, ".idx");
+	write_idx_file(packname, p_objs, nr_objects, &idx_opts, pack_sha1);
+
+	free(packname);
+}
+
+static int git_pack_config(const char *k, const char *v, void *cb)
+{
+	if (!strcmp(k, "pack.compression")) {
+		int level = git_config_int(k, v);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad pack compression level %d", level);
+		pack_compression_level = level;
+		pack_compression_seen = 1;
+		return 0;
+	}
+	return git_default_config(k, v, cb);
+}
+
+int main(int argc, char *argv[])
+{
+	struct packv4_tables v4;
+	char *src_pack, *dst_pack;
+
+	if (argc == 3) {
+		src_pack = argv[1];
+		dst_pack = argv[2];
+	} else if (argc == 4 && !prefixcmp(argv[1], "--min-tree-copy=")) {
+		min_tree_copy = atoi(argv[1] + strlen("--min-tree-copy="));
+		src_pack = argv[2];
+		dst_pack = argv[3];
+	} else {
+		fprintf(stderr, "Usage: %s [--min-tree-copy=<n>] <src_packfile> <dst_packfile>\n", argv[0]);
+		exit(1);
+	}
+
+	git_config(git_pack_config, NULL);
+	if (!pack_compression_seen && core_compression_seen)
+		pack_compression_level = core_compression_level;
+	process_one_pack(&v4, src_pack, dst_pack);
+	if (0)
+		dict_dump(&v4);
+	return 0;
+}
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 04/16] pack v4: add version argument to write_pack_header
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (2 preceding siblings ...)
  2013-09-09 13:57     ` [PATCH v2 03/16] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 05/16] pack_write: tighten valid object type check in encode_in_pack_object_header Nguyễn Thái Ngọc Duy
                       ` (11 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 2 +-
 bulk-checkin.c         | 2 +-
 pack-write.c           | 7 +++++--
 pack.h                 | 3 +--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index f069462..33faea8 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -735,7 +735,7 @@ static void write_pack_file(void)
 		else
 			f = create_tmp_packfile(&pack_tmp_name);
 
-		offset = write_pack_header(f, nr_remaining);
+		offset = write_pack_header(f, 2, nr_remaining);
 		if (!offset)
 			die_errno("unable to write pack header");
 		nr_written = 0;
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6b0b6d4..9d8f0d0 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -176,7 +176,7 @@ static void prepare_to_stream(struct bulk_checkin_state *state,
 	reset_pack_idx_option(&state->pack_idx_opts);
 
 	/* Pretend we are going to write only one object */
-	state->offset = write_pack_header(state->f, 1);
+	state->offset = write_pack_header(state->f, 2, 1);
 	if (!state->offset)
 		die_errno("unable to write pack header");
 }
diff --git a/pack-write.c b/pack-write.c
index 631007e..88e4788 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -186,12 +186,15 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	return index_name;
 }
 
-off_t write_pack_header(struct sha1file *f, uint32_t nr_entries)
+off_t write_pack_header(struct sha1file *f,
+			int version, uint32_t nr_entries)
 {
 	struct pack_header hdr;
 
 	hdr.hdr_signature = htonl(PACK_SIGNATURE);
-	hdr.hdr_version = htonl(PACK_VERSION);
+	hdr.hdr_version = htonl(version);
+	if (!pack_version_ok(hdr.hdr_version))
+		die(_("pack version %d is not supported"), version);
 	hdr.hdr_entries = htonl(nr_entries);
 	if (sha1write(f, &hdr, sizeof(hdr)))
 		return 0;
diff --git a/pack.h b/pack.h
index aa6ee7d..855f6c6 100644
--- a/pack.h
+++ b/pack.h
@@ -8,7 +8,6 @@
  * Packed object header
  */
 #define PACK_SIGNATURE 0x5041434b	/* "PACK" */
-#define PACK_VERSION 2
 #define pack_version_ok(v) ((v) == htonl(2) || (v) == htonl(3))
 struct pack_header {
 	uint32_t hdr_signature;
@@ -80,7 +79,7 @@ extern const char *write_idx_file(const char *index_name, struct pack_idx_entry
 extern int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, off_t offset, off_t len, unsigned int nr);
 extern int verify_pack_index(struct packed_git *);
 extern int verify_pack(struct packed_git *, verify_fn fn, struct progress *, uint32_t);
-extern off_t write_pack_header(struct sha1file *f, uint32_t);
+extern off_t write_pack_header(struct sha1file *f, int, uint32_t);
 extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
 extern char *index_pack_lockfile(int fd);
 extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 05/16] pack_write: tighten valid object type check in encode_in_pack_object_header
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (3 preceding siblings ...)
  2013-09-09 13:57     ` [PATCH v2 04/16] pack v4: add version argument to write_pack_header Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 06/16] pack-write.c: add pv4_encode_object_header Nguyễn Thái Ngọc Duy
                       ` (10 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-write.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pack-write.c b/pack-write.c
index 88e4788..36b88a3 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -325,8 +325,17 @@ int encode_in_pack_object_header(enum object_type type, uintmax_t size, unsigned
 	int n = 1;
 	unsigned char c;
 
-	if (type < OBJ_COMMIT || type > OBJ_REF_DELTA)
+	switch (type) {
+	case OBJ_COMMIT:
+	case OBJ_TREE:
+	case OBJ_BLOB:
+	case OBJ_TAG:
+	case OBJ_OFS_DELTA:
+	case OBJ_REF_DELTA:
+		break;
+	default:
 		die("bad type %d", type);
+	}
 
 	c = (type << 4) | (size & 15);
 	size >>= 4;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 06/16] pack-write.c: add pv4_encode_object_header
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (4 preceding siblings ...)
  2013-09-09 13:57     ` [PATCH v2 05/16] pack_write: tighten valid object type check in encode_in_pack_object_header Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 07/16] pack-objects: add --version to specify written pack version Nguyễn Thái Ngọc Duy
                       ` (9 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 pack-write.c | 33 +++++++++++++++++++++++++++++++++
 pack.h       |  1 +
 2 files changed, 34 insertions(+)

diff --git a/pack-write.c b/pack-write.c
index 36b88a3..c1e9da4 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -1,6 +1,7 @@
 #include "cache.h"
 #include "pack.h"
 #include "csum-file.h"
+#include "varint.h"
 
 void reset_pack_idx_option(struct pack_idx_option *opts)
 {
@@ -349,6 +350,38 @@ int encode_in_pack_object_header(enum object_type type, uintmax_t size, unsigned
 	return n;
 }
 
+int pv4_encode_object_header(enum object_type type,
+			     uintmax_t size, unsigned char *hdr)
+{
+	uintmax_t val;
+
+	switch (type) {
+	case OBJ_COMMIT:
+	case OBJ_TREE:
+	case OBJ_BLOB:
+	case OBJ_TAG:
+	case OBJ_REF_DELTA:
+	case OBJ_PV4_COMMIT:
+	case OBJ_PV4_TREE:
+		break;
+	default:
+		die("bad type %d", type);
+	}
+
+	/*
+	 * We allocate 4 bits in the LSB for the object type which
+	 * should be good for quite a while, given that we effectively
+	 * encode only 5 object types: commit, tree, blob, delta,
+	 * tag.
+	 */
+	val = size;
+	if (MSB(val, 4))
+		die("fixme: the code doesn't currently cope with big sizes");
+	val <<= 4;
+	val |= type;
+	return encode_varint(val, hdr);
+}
+
 struct sha1file *create_tmp_packfile(char **pack_tmp_name)
 {
 	char tmpname[PATH_MAX];
diff --git a/pack.h b/pack.h
index 855f6c6..4f10fa4 100644
--- a/pack.h
+++ b/pack.h
@@ -83,6 +83,7 @@ extern off_t write_pack_header(struct sha1file *f, int, uint32_t);
 extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
 extern char *index_pack_lockfile(int fd);
 extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
+extern int pv4_encode_object_header(enum object_type, uintmax_t, unsigned char *);
 
 #define PH_ERROR_EOF		(-1)
 #define PH_ERROR_PACK_SIGNATURE	(-2)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 07/16] pack-objects: add --version to specify written pack version
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (5 preceding siblings ...)
  2013-09-09 13:57     ` [PATCH v2 06/16] pack-write.c: add pv4_encode_object_header Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:57     ` [PATCH v2 08/16] list-objects.c: add show_tree_entry callback to traverse_commit_list Nguyễn Thái Ngọc Duy
                       ` (8 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 33faea8..ef68fc5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -81,6 +81,7 @@ static int num_preferred_base;
 static struct progress *progress_state;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 static int pack_compression_seen;
+static int pack_version = 2;
 
 static unsigned long delta_cache_size = 0;
 static unsigned long max_delta_cache_size = 256 * 1024 * 1024;
@@ -735,7 +736,7 @@ static void write_pack_file(void)
 		else
 			f = create_tmp_packfile(&pack_tmp_name);
 
-		offset = write_pack_header(f, 2, nr_remaining);
+		offset = write_pack_header(f, pack_version, nr_remaining);
 		if (!offset)
 			die_errno("unable to write pack header");
 		nr_written = 0;
@@ -2455,6 +2456,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		{ OPTION_CALLBACK, 0, "index-version", NULL, N_("version[,offset]"),
 		  N_("write the pack index file in the specified idx format version"),
 		  0, option_parse_index_version },
+		OPT_INTEGER(0, "version", &pack_version, N_("pack version")),
 		OPT_ULONG(0, "max-pack-size", &pack_size_limit,
 			  N_("maximum size of each output pack file")),
 		OPT_BOOL(0, "local", &local,
@@ -2525,6 +2527,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
+	if (pack_version != 2)
+		die(_("pack version %d is not supported"), pack_version);
 
 	rp_av[rp_ac++] = "pack-objects";
 	if (thin) {
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 08/16] list-objects.c: add show_tree_entry callback to traverse_commit_list
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (6 preceding siblings ...)
  2013-09-09 13:57     ` [PATCH v2 07/16] pack-objects: add --version to specify written pack version Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:57     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 09/16] pack-objects: do not cache delta for v4 trees Nguyễn Thái Ngọc Duy
                       ` (7 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:57 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

This helps construct the tree dictionary in pack v4.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 2 +-
 builtin/rev-list.c     | 4 ++--
 list-objects.c         | 9 ++++++++-
 list-objects.h         | 3 ++-
 upload-pack.c          | 2 +-
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ef68fc5..b38d3dc 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -2380,7 +2380,7 @@ static void get_object_list(int ac, const char **av)
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed");
 	mark_edges_uninteresting(revs.commits, &revs, show_edge);
-	traverse_commit_list(&revs, show_commit, show_object, NULL);
+	traverse_commit_list(&revs, show_commit, NULL, show_object, NULL);
 
 	if (keep_unreachable)
 		add_objects_in_unpacked_packs(&revs);
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index a5ec30d..b25f896 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -243,7 +243,7 @@ static int show_bisect_vars(struct rev_list_info *info, int reaches, int all)
 		strcpy(hex, sha1_to_hex(revs->commits->item->object.sha1));
 
 	if (flags & BISECT_SHOW_ALL) {
-		traverse_commit_list(revs, show_commit, show_object, info);
+		traverse_commit_list(revs, show_commit, NULL, show_object, info);
 		printf("------\n");
 	}
 
@@ -348,7 +348,7 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
 			return show_bisect_vars(&info, reaches, all);
 	}
 
-	traverse_commit_list(&revs, show_commit, show_object, &info);
+	traverse_commit_list(&revs, show_commit, NULL, show_object, &info);
 
 	if (revs.count) {
 		if (revs.left_right && revs.cherry_mark)
diff --git a/list-objects.c b/list-objects.c
index 3dd4a96..6def897 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -61,6 +61,7 @@ static void process_gitlink(struct rev_info *revs,
 
 static void process_tree(struct rev_info *revs,
 			 struct tree *tree,
+			 show_tree_entry_fn show_tree_entry,
 			 show_object_fn show,
 			 struct name_path *path,
 			 struct strbuf *base,
@@ -107,9 +108,13 @@ static void process_tree(struct rev_info *revs,
 				continue;
 		}
 
+		if (show_tree_entry)
+			show_tree_entry(&entry, cb_data);
+
 		if (S_ISDIR(entry.mode))
 			process_tree(revs,
 				     lookup_tree(entry.sha1),
+				     show_tree_entry,
 				     show, &me, base, entry.path,
 				     cb_data);
 		else if (S_ISGITLINK(entry.mode))
@@ -167,6 +172,7 @@ static void add_pending_tree(struct rev_info *revs, struct tree *tree)
 
 void traverse_commit_list(struct rev_info *revs,
 			  show_commit_fn show_commit,
+			  show_tree_entry_fn show_tree_entry,
 			  show_object_fn show_object,
 			  void *data)
 {
@@ -196,7 +202,8 @@ void traverse_commit_list(struct rev_info *revs,
 			continue;
 		}
 		if (obj->type == OBJ_TREE) {
-			process_tree(revs, (struct tree *)obj, show_object,
+			process_tree(revs, (struct tree *)obj,
+				     show_tree_entry, show_object,
 				     NULL, &base, name, data);
 			continue;
 		}
diff --git a/list-objects.h b/list-objects.h
index 3db7bb6..297b2e0 100644
--- a/list-objects.h
+++ b/list-objects.h
@@ -2,8 +2,9 @@
 #define LIST_OBJECTS_H
 
 typedef void (*show_commit_fn)(struct commit *, void *);
+typedef void (*show_tree_entry_fn)(const struct name_entry *, void *);
 typedef void (*show_object_fn)(struct object *, const struct name_path *, const char *, void *);
-void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);
+void traverse_commit_list(struct rev_info *, show_commit_fn, show_tree_entry_fn, show_object_fn, void *);
 
 typedef void (*show_edge_fn)(struct commit *);
 void mark_edges_uninteresting(struct commit_list *, struct rev_info *, show_edge_fn);
diff --git a/upload-pack.c b/upload-pack.c
index 127e59a..ccf76d9 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -125,7 +125,7 @@ static int do_rev_list(int in, int out, void *user_data)
 		for (i = 0; i < extra_edge_obj.nr; i++)
 			fprintf(pack_pipe, "-%s\n", sha1_to_hex(
 					extra_edge_obj.objects[i].item->sha1));
-	traverse_commit_list(&revs, show_commit, show_object, NULL);
+	traverse_commit_list(&revs, show_commit, NULL, show_object, NULL);
 	fflush(pack_pipe);
 	fclose(pack_pipe);
 	return 0;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 09/16] pack-objects: do not cache delta for v4 trees
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (7 preceding siblings ...)
  2013-09-09 13:57     ` [PATCH v2 08/16] list-objects.c: add show_tree_entry callback to traverse_commit_list Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 10/16] pack-objects: exclude commits out of delta objects in v4 Nguyễn Thái Ngọc Duy
                       ` (6 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b38d3dc..9613732 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1756,8 +1756,12 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
 		 * and therefore it is best to go to the write phase ASAP
 		 * instead, as we can afford spending more time compressing
 		 * between writes at that moment.
+		 *
+		 * For v4 trees we'll need to delta differently anyway
+		 * so no cache. v4 commits simply do not delta.
 		 */
-		if (entry->delta_data && !pack_to_stdout) {
+		if (entry->delta_data && !pack_to_stdout &&
+		    (pack_version < 4 || entry->type == OBJ_BLOB)) {
 			entry->z_delta_size = do_compress(&entry->delta_data,
 							  entry->delta_size);
 			cache_lock();
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 10/16] pack-objects: exclude commits out of delta objects in v4
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (8 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 09/16] pack-objects: do not cache delta for v4 trees Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 11/16] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
                       ` (5 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 9613732..fb2394d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1329,7 +1329,8 @@ static void check_object(struct object_entry *entry)
 			break;
 		}
 
-		if (base_ref && (base_entry = locate_object_entry(base_ref))) {
+		if (base_ref && (base_entry = locate_object_entry(base_ref)) &&
+		    (pack_version < 4 || entry->type != OBJ_COMMIT)) {
 			/*
 			 * If base_ref was set above that means we wish to
 			 * reuse delta data, and we even found that base
@@ -1413,6 +1414,8 @@ static void get_object_details(void)
 		check_object(entry);
 		if (big_file_threshold < entry->size)
 			entry->no_try_delta = 1;
+		if (pack_version == 4 && entry->type == OBJ_COMMIT)
+			entry->no_try_delta = 1;
 	}
 
 	free(sorted_by_offset);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 11/16] pack-objects: create pack v4 tables
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (9 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 10/16] pack-objects: exclude commits out of delta objects in v4 Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 12/16] pack-objects: prepare SHA-1 table in v4 Nguyễn Thái Ngọc Duy
                       ` (4 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fb2394d..60ea5a7 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -18,6 +18,7 @@
 #include "refs.h"
 #include "streaming.h"
 #include "thread-utils.h"
+#include "packv4-create.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [options...] [< ref-list | < object-list]"),
@@ -61,6 +62,8 @@ static struct object_entry *objects;
 static struct pack_idx_entry **written_list;
 static uint32_t nr_objects, nr_alloc, nr_result, nr_written;
 
+static struct packv4_tables v4;
+
 static int non_empty;
 static int reuse_delta = 1, reuse_object = 1;
 static int keep_unreachable, unpack_unreachable, include_tag;
@@ -2052,6 +2055,11 @@ static void prepare_pack(int window, int depth)
 	uint32_t i, nr_deltas;
 	unsigned n;
 
+	if (pack_version == 4) {
+		sort_dict_entries_by_hits(v4.commit_ident_table);
+		sort_dict_entries_by_hits(v4.tree_path_table);
+	}
+
 	get_object_details();
 
 	/*
@@ -2198,6 +2206,34 @@ static void read_object_list_from_stdin(void)
 
 		add_preferred_base_object(line+41);
 		add_object_entry(sha1, 0, line+41, 0);
+
+		if (pack_version == 4) {
+			void *data;
+			enum object_type type;
+			unsigned long size;
+			int (*add_dict_entries)(struct dict_table *, void *, unsigned long);
+			struct dict_table *dict;
+
+			switch (sha1_object_info(sha1, &size)) {
+			case OBJ_COMMIT:
+				add_dict_entries = add_commit_dict_entries;
+				dict = v4.commit_ident_table;
+				break;
+			case OBJ_TREE:
+				add_dict_entries = add_tree_dict_entries;
+				dict = v4.tree_path_table;
+				break;
+			default:
+				continue;
+			}
+			data = read_sha1_file(sha1, &type, &size);
+			if (!data)
+				die("cannot unpack %s", sha1_to_hex(sha1));
+			if (add_dict_entries(dict, data, size) < 0)
+				die("can't process %s object %s",
+				    typename(type), sha1_to_hex(sha1));
+			free(data);
+		}
 	}
 }
 
@@ -2205,10 +2241,26 @@ static void read_object_list_from_stdin(void)
 
 static void show_commit(struct commit *commit, void *data)
 {
+	if (pack_version == 4) {
+		unsigned long size;
+		enum object_type type;
+		unsigned char *buf;
+
+		/* commit->buffer is NULL most of the time, don't bother */
+		buf = read_sha1_file(commit->object.sha1, &type, &size);
+		add_commit_dict_entries(v4.commit_ident_table, buf, size);
+		free(buf);
+	}
 	add_object_entry(commit->object.sha1, OBJ_COMMIT, NULL, 0);
 	commit->object.flags |= OBJECT_ADDED;
 }
 
+static void show_tree_entry(const struct name_entry *entry, void *data)
+{
+	dict_add_entry(v4.tree_path_table, entry->mode, entry->path,
+		       tree_entry_len(entry));
+}
+
 static void show_object(struct object *obj,
 			const struct name_path *path, const char *last,
 			void *data)
@@ -2387,7 +2439,9 @@ static void get_object_list(int ac, const char **av)
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed");
 	mark_edges_uninteresting(revs.commits, &revs, show_edge);
-	traverse_commit_list(&revs, show_commit, NULL, show_object, NULL);
+	traverse_commit_list(&revs, show_commit,
+			     pack_version == 4 ? show_tree_entry : NULL,
+			     show_object, NULL);
 
 	if (keep_unreachable)
 		add_objects_in_unpacked_packs(&revs);
@@ -2534,7 +2588,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	}
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
-	if (pack_version != 2)
+	if (pack_version != 2 && pack_version != 4)
 		die(_("pack version %d is not supported"), pack_version);
 
 	rp_av[rp_ac++] = "pack-objects";
@@ -2586,6 +2640,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		progress = 2;
 
 	prepare_packed_git();
+	if (pack_version == 4) {
+		v4.commit_ident_table = create_dict_table();
+		v4.tree_path_table = create_dict_table();
+	}
 
 	if (progress)
 		progress_state = start_progress("Counting objects", 0);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 12/16] pack-objects: prepare SHA-1 table in v4
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (10 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 11/16] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 13/16] pack-objects: support writing pack v4 Nguyễn Thái Ngọc Duy
                       ` (3 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

The SHA-1 table is trickier than the ident or path tables because it
must contain exactly the number of entries in the pack. In the thin pack
case it must also cover bases that will be appended by index-pack.

The problem is that not all preferred_base entries end up actually
being needed. So we do a fake write_one() round just to find out what
is written and what is not. It also helps the case when multiple packs
are written.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 60ea5a7..055b59d 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -434,7 +434,7 @@ static unsigned long write_object(struct sha1file *f,
 	unsigned long limit, len;
 	int usable_delta, to_reuse;
 
-	if (!pack_to_stdout)
+	if (f && !pack_to_stdout)
 		crc32_begin(f);
 
 	/* apply size limit if limited packsize and not first object */
@@ -477,6 +477,12 @@ static unsigned long write_object(struct sha1file *f,
 				 * and we do not need to deltify it.
 				 */
 
+	if (!f) {
+		if (usable_delta && entry->delta->idx.offset < 2)
+			entry->delta->idx.offset = 2;
+		return 2;
+	}
+
 	if (!to_reuse)
 		len = write_no_reuse_object(f, entry, limit, usable_delta);
 	else
@@ -543,10 +549,14 @@ static enum write_one_status write_one(struct sha1file *f,
 		e->idx.offset = recursing;
 		return WRITE_ONE_BREAK;
 	}
+	if (!f) {
+		*offset += size;
+		return WRITE_ONE_WRITTEN;
+	}
 	written_list[nr_written++] = &e->idx;
 
 	/* make sure off_t is sufficiently large not to wrap */
-	if (signed_add_overflows(*offset, size))
+	if (f && signed_add_overflows(*offset, size))
 		die("pack too large for current definition of off_t");
 	*offset += size;
 	return WRITE_ONE_WRITTEN;
@@ -716,6 +726,39 @@ static struct object_entry **compute_write_order(void)
 	return wo;
 }
 
+static int sha1_idx_sort(const void *a_, const void *b_)
+{
+	const struct pack_idx_entry *a = a_;
+	const struct pack_idx_entry *b = b_;
+	return hashcmp(a->sha1, b->sha1);
+}
+
+/*
+ * Do a fake writting round to detemine what's in the SHA-1 table.
+ */
+static void prepare_sha1_table(uint32_t start, struct object_entry **write_order)
+{
+	int i = start;
+	off_t fake_offset = 2;
+	for (; i < nr_objects; i++) {
+		struct object_entry *e = write_order[i];
+		if (write_one(NULL, e, &fake_offset) == WRITE_ONE_BREAK)
+			break;
+	}
+
+	v4.all_objs_nr = 0;
+	for (i = 0; i < nr_objects; i++) {
+		struct object_entry *e = write_order[i];
+		if (e->idx.offset > 0) {
+			v4.all_objs[v4.all_objs_nr++] = e->idx;
+			fprintf(stderr, "%s in\n", sha1_to_hex(e->idx.sha1));
+			e->idx.offset = 0;
+		}
+	}
+	qsort(v4.all_objs, v4.all_objs_nr, sizeof(*v4.all_objs),
+	      sha1_idx_sort);
+}
+
 static void write_pack_file(void)
 {
 	uint32_t i = 0, j;
@@ -739,7 +782,12 @@ static void write_pack_file(void)
 		else
 			f = create_tmp_packfile(&pack_tmp_name);
 
-		offset = write_pack_header(f, pack_version, nr_remaining);
+		if (pack_version == 4)
+			prepare_sha1_table(i, write_order);
+
+		offset = write_pack_header(f, pack_version,
+					   pack_version < 4 ? nr_remaining : v4.all_objs_nr);
+
 		if (!offset)
 			die_errno("unable to write pack header");
 		nr_written = 0;
@@ -2058,6 +2106,7 @@ static void prepare_pack(int window, int depth)
 	if (pack_version == 4) {
 		sort_dict_entries_by_hits(v4.commit_ident_table);
 		sort_dict_entries_by_hits(v4.tree_path_table);
+		v4.all_objs = xmalloc(nr_objects * sizeof(*v4.all_objs));
 	}
 
 	get_object_details();
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 13/16] pack-objects: support writing pack v4
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (11 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 12/16] pack-objects: prepare SHA-1 table in v4 Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 14/16] pack v4: support "end-of-pack" indicator in index-pack and pack-objects Nguyễn Thái Ngọc Duy
                       ` (2 subsequent siblings)
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/pack-objects.c | 85 +++++++++++++++++++++++++++++++++++++++++++++-----
 pack.h                 |  2 +-
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 055b59d..12d9af4 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -254,6 +254,7 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	enum object_type type;
 	void *buf;
 	struct git_istream *st = NULL;
+	char *result = "OK";
 
 	if (!usable_delta) {
 		if (entry->type == OBJ_BLOB &&
@@ -287,7 +288,37 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 
 	if (st)	/* large blob case, just assume we don't compress well */
 		datalen = size;
-	else if (entry->z_delta_size)
+	else if (pack_version == 4 && entry->type == OBJ_COMMIT) {
+		datalen = size;
+		result = pv4_encode_commit(&v4, buf, &datalen);
+		if (result) {
+			free(buf);
+			buf = result;
+			type = OBJ_PV4_COMMIT;
+		}
+	} else if (pack_version == 4 && entry->type == OBJ_TREE) {
+		datalen = size;
+		if (usable_delta) {
+			unsigned long base_size;
+			char *base_buf;
+			base_buf = read_sha1_file(entry->delta->idx.sha1, &type,
+						  &base_size);
+			if (!base_buf || type != OBJ_TREE)
+				die("unable to read %s",
+				    sha1_to_hex(entry->delta->idx.sha1));
+			result = pv4_encode_tree(&v4, buf, &datalen,
+						 base_buf, base_size,
+						 entry->delta->idx.sha1);
+			free(base_buf);
+		} else
+			result = pv4_encode_tree(&v4, buf, &datalen,
+						 NULL, 0, NULL);
+		if (result) {
+			free(buf);
+			buf = result;
+			type = OBJ_PV4_TREE;
+		}
+	} else if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
 	else
 		datalen = do_compress(&buf, size);
@@ -296,7 +327,10 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	 * The object header is a byte of 'type' followed by zero or
 	 * more bytes of length.
 	 */
-	hdrlen = encode_in_pack_object_header(type, size, header);
+	if (pack_version < 4)
+		hdrlen = encode_in_pack_object_header(type, size, header);
+	else
+		hdrlen = pv4_encode_object_header(type, size, header);
 
 	if (type == OBJ_OFS_DELTA) {
 		/*
@@ -318,7 +352,7 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		sha1write(f, header, hdrlen);
 		sha1write(f, dheader + pos, sizeof(dheader) - pos);
 		hdrlen += sizeof(dheader) - pos;
-	} else if (type == OBJ_REF_DELTA) {
+	} else if (type == OBJ_REF_DELTA && pack_version < 4) {
 		/*
 		 * Deltas with a base reference contain
 		 * an additional 20 bytes for the base sha1.
@@ -332,6 +366,10 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		sha1write(f, header, hdrlen);
 		sha1write(f, entry->delta->idx.sha1, 20);
 		hdrlen += 20;
+	} else if (type == OBJ_REF_DELTA && pack_version == 4) {
+		hdrlen += encode_sha1ref(&v4, entry->delta->idx.sha1,
+					header + hdrlen);
+		sha1write(f, header, hdrlen);
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
 			if (st)
@@ -341,14 +379,26 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		}
 		sha1write(f, header, hdrlen);
 	}
+
 	if (st) {
 		datalen = write_large_blob_data(st, f, entry->idx.sha1);
 		close_istream(st);
-	} else {
-		sha1write(f, buf, datalen);
-		free(buf);
+		return hdrlen + datalen;
 	}
 
+	if (!result) {
+		warning(_("can't convert %s object %s"),
+			typename(entry->type),
+			sha1_to_hex(entry->idx.sha1));
+		free(buf);
+		buf = read_sha1_file(entry->idx.sha1, &type, &size);
+		if (!buf)
+			die(_("unable to read %s"),
+			    sha1_to_hex(entry->idx.sha1));
+		datalen = do_compress(&buf, size);
+	}
+	sha1write(f, buf, datalen);
+	free(buf);
 	return hdrlen + datalen;
 }
 
@@ -368,7 +418,10 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 	if (entry->delta)
 		type = (allow_ofs_delta && entry->delta->idx.offset) ?
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
-	hdrlen = encode_in_pack_object_header(type, entry->size, header);
+	if (pack_version < 4)
+		hdrlen = encode_in_pack_object_header(type, entry->size, header);
+	else
+		hdrlen = pv4_encode_object_header(type, entry->size, header);
 
 	offset = entry->in_pack_offset;
 	revidx = find_pack_revindex(p, offset);
@@ -404,7 +457,7 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 		sha1write(f, dheader + pos, sizeof(dheader) - pos);
 		hdrlen += sizeof(dheader) - pos;
 		reused_delta++;
-	} else if (type == OBJ_REF_DELTA) {
+	} else if (type == OBJ_REF_DELTA && pack_version < 4) {
 		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
 			unuse_pack(&w_curs);
 			return 0;
@@ -413,6 +466,11 @@ static unsigned long write_reuse_object(struct sha1file *f, struct object_entry
 		sha1write(f, entry->delta->idx.sha1, 20);
 		hdrlen += 20;
 		reused_delta++;
+	} else if (type == OBJ_REF_DELTA && pack_version == 4) {
+		hdrlen += encode_sha1ref(&v4, entry->delta->idx.sha1,
+					header + hdrlen);
+		sha1write(f, header, hdrlen);
+		reused_delta++;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
 			unuse_pack(&w_curs);
@@ -460,6 +518,9 @@ static unsigned long write_object(struct sha1file *f,
 	else
 		usable_delta = 0;	/* base could end up in another pack */
 
+	if (pack_version == 4 && entry->type == OBJ_TREE)
+		usable_delta = 0;
+
 	if (!reuse_object)
 		to_reuse = 0;	/* explicit */
 	else if (!entry->in_pack)
@@ -477,6 +538,10 @@ static unsigned long write_object(struct sha1file *f,
 				 * and we do not need to deltify it.
 				 */
 
+	if (pack_version == 4 &&
+	     (entry->type == OBJ_TREE || entry->type == OBJ_COMMIT))
+		to_reuse = 0;
+
 	if (!f) {
 		if (usable_delta && entry->delta->idx.offset < 2)
 			entry->delta->idx.offset = 2;
@@ -790,6 +855,8 @@ static void write_pack_file(void)
 
 		if (!offset)
 			die_errno("unable to write pack header");
+		if (pack_version == 4)
+			offset += packv4_write_tables(f, &v4);
 		nr_written = 0;
 		for (; i < nr_objects; i++) {
 			struct object_entry *e = write_order[i];
@@ -2107,6 +2174,8 @@ static void prepare_pack(int window, int depth)
 		sort_dict_entries_by_hits(v4.commit_ident_table);
 		sort_dict_entries_by_hits(v4.tree_path_table);
 		v4.all_objs = xmalloc(nr_objects * sizeof(*v4.all_objs));
+		pack_idx_opts.version = 3;
+		allow_ofs_delta = 0;
 	}
 
 	get_object_details();
diff --git a/pack.h b/pack.h
index 4f10fa4..ccefdbe 100644
--- a/pack.h
+++ b/pack.h
@@ -8,7 +8,7 @@
  * Packed object header
  */
 #define PACK_SIGNATURE 0x5041434b	/* "PACK" */
-#define pack_version_ok(v) ((v) == htonl(2) || (v) == htonl(3))
+#define pack_version_ok(v) ((v) == htonl(2) || (v) == htonl(3) || (v) == htonl(4))
 struct pack_header {
 	uint32_t hdr_signature;
 	uint32_t hdr_version;
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 14/16] pack v4: support "end-of-pack" indicator in index-pack and pack-objects
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (12 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 13/16] pack-objects: support writing pack v4 Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size Nguyễn Thái Ngọc Duy
  2013-09-09 13:58     ` [PATCH v2 16/16] index-pack: support completing thin packs v4 Nguyễn Thái Ngọc Duy
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

In v2, the number of objects in the pack header indicates how many
objects are sent. In v4 this is no longer true: that number includes
the base objects omitted by pack-objects. An "end-of-pack" is
inserted just before the final SHA-1 to let index-pack know when to
stop. The EOP is zero (in variable length encoding it means type zero,
OBJ_NONE, and size zero).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c   | 29 +++++++++++++++++++++++++----
 builtin/pack-objects.c | 15 +++++++++++----
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 88340b5..9036f3e 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1493,7 +1493,7 @@ static void parse_dictionaries(void)
  */
 static void parse_pack_objects(unsigned char *sha1)
 {
-	int i, nr_delays = 0;
+	int i, nr_delays = 0, eop = 0;
 	struct stat st;
 
 	if (verbose)
@@ -1502,7 +1502,28 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, obj->idx.sha1);
+		void *data;
+
+		if (packv4) {
+			unsigned char *eop_byte;
+			flush();
+			/* Got End-of-Pack signal? */
+			eop_byte = fill(1);
+			if (*eop_byte == 0) {
+				git_SHA1_Update(&input_ctx, eop_byte, 1);
+				use(1);
+				/*
+				 * consumed by is used to mark the end
+				 * of the object right after this
+				 * loop. Undo use() effect.
+				 */
+				consumed_bytes--;
+				eop = 1; /* so we don't flush() again */
+				break;
+			}
+		}
+
+		data = unpack_raw_entry(obj, obj->idx.sha1);
 		if (is_delta_type(obj->type) || is_delta_tree(obj)) {
 			/* delay sha1_object() until second pass */
 		} else if (!data) {
@@ -1521,8 +1542,8 @@ static void parse_pack_objects(unsigned char *sha1)
 	objects[i].idx.offset = consumed_bytes;
 	stop_progress(&progress);
 
-	/* Check pack integrity */
-	flush();
+	if (!eop)
+		flush();	/* Check pack integrity */
 	git_SHA1_Final(sha1, &input_ctx);
 	if (hashcmp(fill(20), sha1))
 		die(_("pack is corrupted (SHA1 mismatch)"));
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 12d9af4..1efb728 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -865,15 +865,22 @@ static void write_pack_file(void)
 			display_progress(progress_state, written);
 		}
 
-		/*
-		 * Did we write the wrong # entries in the header?
-		 * If so, rewrite it like in fast-import
-		 */
 		if (pack_to_stdout) {
+			unsigned char type_zero = 0;
+			/*
+			 * Pack v4 thin pack is terminated by a "type
+			 * 0, size 0" in variable length encoding
+			 */
+			if (pack_version == 4 && nr_written < nr_objects)
+				sha1write(f, &type_zero, 1);
 			sha1close(f, sha1, CSUM_CLOSE);
 		} else if (nr_written == nr_remaining) {
 			sha1close(f, sha1, CSUM_FSYNC);
 		} else {
+			/*
+			 * Did we write the wrong # entries in the header?
+			 * If so, rewrite it like in fast-import
+			 */
 			int fd = sha1close(f, sha1, 0);
 			fixup_pack_header_footer(fd, sha1, pack_tmp_name,
 						 nr_written, sha1, offset);
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (13 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 14/16] pack v4: support "end-of-pack" indicator in index-pack and pack-objects Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  2013-09-09 15:01       ` Nicolas Pitre
  2013-09-09 13:58     ` [PATCH v2 16/16] index-pack: support completing thin packs v4 Nguyễn Thái Ngọc Duy
  15 siblings, 1 reply; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy

nr_objects in the next patch is used to reflect the number of actual
objects in the stream, which may be smaller than the number recorded
in pack header.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9036f3e..dc9961b 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -80,6 +80,7 @@ static int nr_objects;
 static int nr_deltas;
 static int nr_resolved_deltas;
 static int nr_threads;
+static int nr_objects_final;
 
 static int from_stdin;
 static int strict;
@@ -297,7 +298,7 @@ static void check_against_sha1table(const unsigned char *sha1)
 	if (!packv4)
 		return;
 
-	found = bsearch(sha1, sha1_table, nr_objects, 20,
+	found = bsearch(sha1, sha1_table, nr_objects_final, 20,
 			(int (*)(const void *, const void *))hashcmp);
 	if (!found)
 		die(_("object %s not found in SHA-1 table"),
@@ -331,7 +332,7 @@ static const unsigned char *read_sha1ref(void)
 		return sha1;
 	}
 	index--;
-	if (index >= nr_objects)
+	if (index >= nr_objects_final)
 		bad_object(consumed_bytes,
 			   _("bad index in read_sha1ref"));
 	return sha1_table + index * 20;
@@ -340,7 +341,7 @@ static const unsigned char *read_sha1ref(void)
 static const unsigned char *read_sha1table_ref(void)
 {
 	const unsigned char *sha1 = read_sha1ref();
-	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects * 20)
+	if (sha1 < sha1_table || sha1 >= sha1_table + nr_objects_final * 20)
 		check_against_sha1table(sha1);
 	return sha1;
 }
@@ -392,7 +393,7 @@ static void parse_pack_header(void)
 		die(_("pack version %"PRIu32" unsupported"),
 			ntohl(hdr->hdr_version));
 
-	nr_objects = ntohl(hdr->hdr_entries);
+	nr_objects_final = nr_objects = ntohl(hdr->hdr_entries);
 	use(sizeof(struct pack_header));
 }
 
@@ -1472,9 +1473,9 @@ static void parse_dictionaries(void)
 	if (!packv4)
 		return;
 
-	sha1_table = xmalloc(20 * nr_objects);
+	sha1_table = xmalloc(20 * nr_objects_final);
 	hashcpy(sha1_table, fill_and_use(20));
-	for (i = 1; i < nr_objects; i++) {
+	for (i = 1; i < nr_objects_final; i++) {
 		unsigned char *p = sha1_table + i * 20;
 		hashcpy(p, fill_and_use(20));
 		if (hashcmp(p - 20, p) >= 0)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* [PATCH v2 16/16] index-pack: support completing thin packs v4
  2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
                       ` (14 preceding siblings ...)
  2013-09-09 13:58     ` [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size Nguyễn Thái Ngọc Duy
@ 2013-09-09 13:58     ` Nguyễn Thái Ngọc Duy
  15 siblings, 0 replies; 124+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-09 13:58 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 53 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index dc9961b..8a6e2a3 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -97,7 +97,7 @@ static unsigned char input_buffer[4096];
 static unsigned int input_offset, input_len;
 static off_t consumed_bytes;
 static unsigned deepest_delta;
-static git_SHA_CTX input_ctx;
+static git_SHA_CTX input_ctx, output_ctx;
 static uint32_t input_crc32;
 static int input_fd, output_fd, pack_fd;
 
@@ -1511,6 +1511,7 @@ static void parse_pack_objects(unsigned char *sha1)
 			/* Got End-of-Pack signal? */
 			eop_byte = fill(1);
 			if (*eop_byte == 0) {
+				output_ctx = input_ctx;
 				git_SHA1_Update(&input_ctx, eop_byte, 1);
 				use(1);
 				/*
@@ -1540,7 +1541,8 @@ static void parse_pack_objects(unsigned char *sha1)
 		free(data);
 		display_progress(progress, i+1);
 	}
-	objects[i].idx.offset = consumed_bytes;
+	nr_objects = i;
+	objects[nr_objects].idx.offset = consumed_bytes;
 	stop_progress(&progress);
 
 	if (!eop)
@@ -1634,7 +1636,7 @@ static void conclude_pack(int fix_thin_pack, const char *curr_pack, unsigned cha
 		return;
 	}
 
-	if (fix_thin_pack) {
+	if (fix_thin_pack && !packv4) {
 		struct sha1file *f;
 		unsigned char read_sha1[20], tail_sha1[20];
 		struct strbuf msg = STRBUF_INIT;
@@ -1661,6 +1663,26 @@ static void conclude_pack(int fix_thin_pack, const char *curr_pack, unsigned cha
 		if (hashcmp(read_sha1, tail_sha1) != 0)
 			die(_("Unexpected tail checksum for %s "
 			      "(disk corruption?)"), curr_pack);
+	} else	if (fix_thin_pack && packv4) {
+		struct sha1file *f;
+		struct strbuf msg = STRBUF_INIT;
+		int nr_unresolved = nr_deltas - nr_resolved_deltas;
+		int nr_objects_initial = nr_objects;
+		if (nr_unresolved <= 0)
+			die(_("confusion beyond insanity"));
+		f = sha1fd(output_fd, curr_pack);
+		f->ctx = output_ctx; /* resume sha-1 from right before EOP */
+		fix_unresolved_deltas(f, nr_unresolved);
+		if (nr_objects != nr_objects_final)
+			die(_("pack number inconsistency, expected %u got %u"),
+			    nr_objects, nr_objects_final);
+		strbuf_addf(&msg, _("completed with %d local objects"),
+			    nr_objects_final - nr_objects_initial);
+		stop_progress_msg(&progress, msg.buf);
+		strbuf_release(&msg);
+		sha1close(f, pack_sha1, 0);
+		write_or_die(output_fd, pack_sha1, 20);
+		fsync_or_die(output_fd, f->name);
 	}
 	if (nr_deltas != nr_resolved_deltas)
 		die(Q_("pack has %d unresolved delta",
@@ -1700,16 +1722,15 @@ static struct object_entry *append_obj_to_pack(struct sha1file *f,
 {
 	struct object_entry *obj = &objects[nr_objects++];
 	unsigned char header[10];
-	unsigned long s = size;
-	int n = 0;
-	unsigned char c = (type << 4) | (s & 15);
-	s >>= 4;
-	while (s) {
-		header[n++] = c | 0x80;
-		c = s & 0x7f;
-		s >>= 7;
-	}
-	header[n++] = c;
+	int n;
+
+	if (packv4) {
+		if (nr_objects > nr_objects_final)
+			die(_("too many objects"));
+		/* TODO: convert OBJ_TREE to OBJ_PV4_TREE using pv4_encode_tree */
+		n = pv4_encode_object_header(type, size, header);
+	} else
+		n = encode_in_pack_object_header(type, size, header);
 	crc32_begin(f);
 	sha1write(f, header, n);
 	obj[0].size = size;
@@ -1748,7 +1769,8 @@ static void fix_unresolved_deltas(struct sha1file *f, int nr_unresolved)
 	 */
 	sorted_by_pos = xmalloc(nr_unresolved * sizeof(*sorted_by_pos));
 	for (i = 0; i < nr_deltas; i++) {
-		if (objects[deltas[i].obj_no].real_type != OBJ_REF_DELTA)
+		struct object_entry *obj = objects + deltas[i].obj_no;
+		if (obj->real_type != OBJ_REF_DELTA && !is_delta_tree(obj))
 			continue;
 		sorted_by_pos[n++] = &deltas[i];
 	}
@@ -1756,10 +1778,11 @@ static void fix_unresolved_deltas(struct sha1file *f, int nr_unresolved)
 
 	for (i = 0; i < n; i++) {
 		struct delta_entry *d = sorted_by_pos[i];
+		struct object_entry *obj = objects + d->obj_no;
 		enum object_type type;
 		struct base_data *base_obj = alloc_base_data();
 
-		if (objects[d->obj_no].real_type != OBJ_REF_DELTA)
+		if (obj->real_type != OBJ_REF_DELTA && !is_delta_tree(obj))
 			continue;
 		base_obj->data = read_sha1_file(d->base.sha1, &type, &base_obj->size);
 		if (!base_obj->data)
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 13:58     ` [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size Nguyễn Thái Ngọc Duy
@ 2013-09-09 15:01       ` Nicolas Pitre
  2013-09-09 18:34         ` Junio C Hamano
  2013-09-10  0:45         ` Duy Nguyen
  0 siblings, 2 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-09 15:01 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1219 bytes --]

On Mon, 9 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> nr_objects in the next patch is used to reflect the number of actual
> objects in the stream, which may be smaller than the number recorded
> in pack header.

This highlights an issue that has been nagging me for a while.

We decided to send the final number of objects in the thin pack header 
for two reasons:

1) it allows to properly size the SHA1 table upfront which already 
   contains entries for the omitted objects;

2) the whole pack doesn't have to be re-summed again after being 
   completed on the receiving end since we don't alter the header.

However this means that the progress meter will now be wrong and that's 
terrible !  Users *will* complain that the meter doesn't reach 100% and 
they'll protest for being denied the remaining objects during the 
transfer !

Joking aside, we should think about doing something about it.  I was 
wondering if some kind of prefix to the pack stream could be inserted 
onto the wire when sending a pack v4.  Something like:

'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>

This 8-byte prefix would simply be discarded by index-pack after being 
parsed.

What do you think?


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH 08/11] pack-objects: create pack v4 tables
  2013-09-09 13:07       ` Nicolas Pitre
@ 2013-09-09 15:21         ` Junio C Hamano
  0 siblings, 0 replies; 124+ messages in thread
From: Junio C Hamano @ 2013-09-09 15:21 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Duy Nguyen, Git Mailing List

Nicolas Pitre <nico@fluxnic.net> writes:

> Is anyone still using --max-pack-size ?
>
> I'm wondering if producing multiple packs from pack-objects is really 
> useful these days.  If I remember correctly, this was created to allow 
> the archiving of large packs onto CDROMs or the like.

I thought this was more about using a packfile on smaller
(e.g. 32-bit) systems, but I may be mistaken.  2b84b5a8 (Introduce
the config variable pack.packSizeLimit, 2008-02-05) mentions
"filesystem constraints":

    Introduce the config variable pack.packSizeLimit
    
    "git pack-objects" has the option --max-pack-size to limit the
    file size of the packs to a certain amount of bytes.  On
    platforms where the pack file size is limited by filesystem
    constraints, it is easy to forget this option, and this option
    does not exist for "git gc" to begin with.

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 15:01       ` Nicolas Pitre
@ 2013-09-09 18:34         ` Junio C Hamano
  2013-09-09 18:46           ` Nicolas Pitre
  2013-09-10  0:45         ` Duy Nguyen
  1 sibling, 1 reply; 124+ messages in thread
From: Junio C Hamano @ 2013-09-09 18:34 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Nguyễn Thái Ngọc Duy, git

Nicolas Pitre <nico@fluxnic.net> writes:

> On Mon, 9 Sep 2013, Nguyễn Thái Ngọc Duy wrote:
>
>> nr_objects in the next patch is used to reflect the number of actual
>> objects in the stream, which may be smaller than the number recorded
>> in pack header.
>
> This highlights an issue that has been nagging me for a while.
>
> We decided to send the final number of objects in the thin pack header 
> for two reasons:
>
> 1) it allows to properly size the SHA1 table upfront which already 
>    contains entries for the omitted objects;
>
> 2) the whole pack doesn't have to be re-summed again after being 
>    completed on the receiving end since we don't alter the header.
>
> However this means that the progress meter will now be wrong and that's 
> terrible !  Users *will* complain that the meter doesn't reach 100% and 
> they'll protest for being denied the remaining objects during the 
> transfer !
>
> Joking aside, we should think about doing something about it.  I was 
> wondering if some kind of prefix to the pack stream could be inserted 
> onto the wire when sending a pack v4.  Something like:
>
> 'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>
>
> This 8-byte prefix would simply be discarded by index-pack after being 
> parsed.
>
> What do you think?

I do not think it is _too_ bad if the meter jumped from 92% to 100%
when we finish reading from the other end ;-), as long as we can
reliably tell that we read the right thing.

Which brings me to a tangent.  Do we have a means to make sure that
the data received over the wire is bit-for-bit correct as a whole
when it is a thin pack stream?  When it is a non-thin pack stream,
we have the checksum at the end added by sha1close() which
index-pack.c::parse_pack_objects() can (and does) verify.

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 18:34         ` Junio C Hamano
@ 2013-09-09 18:46           ` Nicolas Pitre
  2013-09-09 18:56             ` Junio C Hamano
  0 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-09 18:46 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2246 bytes --]

On Mon, 9 Sep 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > On Mon, 9 Sep 2013, Nguyễn Thái Ngọc Duy wrote:
> >
> >> nr_objects in the next patch is used to reflect the number of actual
> >> objects in the stream, which may be smaller than the number recorded
> >> in pack header.
> >
> > This highlights an issue that has been nagging me for a while.
> >
> > We decided to send the final number of objects in the thin pack header 
> > for two reasons:
> >
> > 1) it allows to properly size the SHA1 table upfront which already 
> >    contains entries for the omitted objects;
> >
> > 2) the whole pack doesn't have to be re-summed again after being 
> >    completed on the receiving end since we don't alter the header.
> >
> > However this means that the progress meter will now be wrong and that's 
> > terrible !  Users *will* complain that the meter doesn't reach 100% and 
> > they'll protest for being denied the remaining objects during the 
> > transfer !
> >
> > Joking aside, we should think about doing something about it.  I was 
> > wondering if some kind of prefix to the pack stream could be inserted 
> > onto the wire when sending a pack v4.  Something like:
> >
> > 'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>
> >
> > This 8-byte prefix would simply be discarded by index-pack after being 
> > parsed.
> >
> > What do you think?
> 
> I do not think it is _too_ bad if the meter jumped from 92% to 100%
> when we finish reading from the other end ;-), as long as we can
> reliably tell that we read the right thing.

Sure, but eventually people will complain about this.  So while we're 
about to introduce a new pack format anyway, better think of this little 
cosmetic detail now when it can be included in the pack v4 capability 
negotiation.

> Which brings me to a tangent.  Do we have a means to make sure that
> the data received over the wire is bit-for-bit correct as a whole
> when it is a thin pack stream?  When it is a non-thin pack stream,
> we have the checksum at the end added by sha1close() which
> index-pack.c::parse_pack_objects() can (and does) verify.

The trailing checksum is still there.  Nothing has changed in that 
regard.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 18:46           ` Nicolas Pitre
@ 2013-09-09 18:56             ` Junio C Hamano
  2013-09-09 19:11               ` Nicolas Pitre
  0 siblings, 1 reply; 124+ messages in thread
From: Junio C Hamano @ 2013-09-09 18:56 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Nguyễn Thái Ngọc Duy, git

Nicolas Pitre <nico@fluxnic.net> writes:

>> > ...  I was 
>> > wondering if some kind of prefix to the pack stream could be inserted 
>> > onto the wire when sending a pack v4.  Something like:
>> >
>> > 'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>
>> >
>> > This 8-byte prefix would simply be discarded by index-pack after being 
>> > parsed.
>> >
>> > What do you think?
>> 
>> I do not think it is _too_ bad if the meter jumped from 92% to 100%
>> when we finish reading from the other end ;-), as long as we can
>> reliably tell that we read the right thing.
>
> Sure, but eventually people will complain about this.  So while we're 
> about to introduce a new pack format anyway, better think of this little 
> cosmetic detail now when it can be included in the pack v4 capability 
> negotiation.

Oh, I completely agree on that part.  When we send a self-contained
pack, would we send nothing?  That is, should the receiving end
expect and rely on that the sending end will send a thin pack and
never a fat pack when asked to send a thin pack (and vice versa)?

Also should we make the "even though we have negotiated the protocol
parameters, after enumerating the objects and deciding what the pack
stream would look like, we have a bit more information to tell you"
the sending side gives the receiver extensible?  I am wondering if
that prefix needs something like "end of prefix" marker (or "here
comes N-bytes worth of prefix information" upfront); we probably do
not need it, as the capability exchange will determine what kind of
information will be sent (e.g. "actual objects in the thin pack data
stream").

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 18:56             ` Junio C Hamano
@ 2013-09-09 19:11               ` Nicolas Pitre
  2013-09-09 19:30                 ` Junio C Hamano
  0 siblings, 1 reply; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-09 19:11 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

On Mon, 9 Sep 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> >> > ...  I was 
> >> > wondering if some kind of prefix to the pack stream could be inserted 
> >> > onto the wire when sending a pack v4.  Something like:
> >> >
> >> > 'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>
> >> >
> >> > This 8-byte prefix would simply be discarded by index-pack after being 
> >> > parsed.
> >> >
> >> > What do you think?
> >> 
> >> I do not think it is _too_ bad if the meter jumped from 92% to 100%
> >> when we finish reading from the other end ;-), as long as we can
> >> reliably tell that we read the right thing.
> >
> > Sure, but eventually people will complain about this.  So while we're 
> > about to introduce a new pack format anyway, better think of this little 
> > cosmetic detail now when it can be included in the pack v4 capability 
> > negotiation.
> 
> Oh, I completely agree on that part.  When we send a self-contained
> pack, would we send nothing?  That is, should the receiving end
> expect and rely on that the sending end will send a thin pack and
> never a fat pack when asked to send a thin pack (and vice versa)?
> 
> Also should we make the "even though we have negotiated the protocol
> parameters, after enumerating the objects and deciding what the pack
> stream would look like, we have a bit more information to tell you"
> the sending side gives the receiver extensible?  I am wondering if
> that prefix needs something like "end of prefix" marker (or "here
> comes N-bytes worth of prefix information" upfront); we probably do
> not need it, as the capability exchange will determine what kind of
> information will be sent (e.g. "actual objects in the thin pack data
> stream").

Do we know the actual number of objects to send during the capability 
negotiation?  I don't think so as this is known only after the 
"compressing objects" phase, and that already depends on the capability 
negotiation before it can start.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 19:11               ` Nicolas Pitre
@ 2013-09-09 19:30                 ` Junio C Hamano
  2013-09-09 19:56                   ` Nicolas Pitre
  0 siblings, 1 reply; 124+ messages in thread
From: Junio C Hamano @ 2013-09-09 19:30 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Nguyễn Thái Ngọc Duy, git

Nicolas Pitre <nico@fluxnic.net> writes:

> Do we know the actual number of objects to send during the capability 
> negotiation?

No, and that is not what I meant.  We know upfront after capability
negotiation (by seeing a request to give them a thin-pack) that we
will send, in addition to the usual packfile, the prefix that
carries that information and that is the important part.  That lets
the receiver decide whether to _expect_ to see the prefix or no
prefix.  Without such, there needs some clue in the prefix part
itself if there are prefixes that carry information computed after
capability negotiation finished (i.e. after "object enumeration").

Sorry if I was unclear.

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 19:30                 ` Junio C Hamano
@ 2013-09-09 19:56                   ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-09 19:56 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

On Mon, 9 Sep 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Do we know the actual number of objects to send during the capability 
> > negotiation?
> 
> No, and that is not what I meant.  We know upfront after capability
> negotiation (by seeing a request to give them a thin-pack) that we
> will send, in addition to the usual packfile, the prefix that
> carries that information and that is the important part.  That lets
> the receiver decide whether to _expect_ to see the prefix or no
> prefix.  Without such, there needs some clue in the prefix part
> itself if there are prefixes that carry information computed after
> capability negotiation finished (i.e. after "object enumeration").

In this case, if negotiation concludes on "thin" and "pack-version=4" 
then that could mean there is a prefix to be expected.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-09 15:01       ` Nicolas Pitre
  2013-09-09 18:34         ` Junio C Hamano
@ 2013-09-10  0:45         ` Duy Nguyen
  2013-09-12 15:34           ` Nicolas Pitre
  1 sibling, 1 reply; 124+ messages in thread
From: Duy Nguyen @ 2013-09-10  0:45 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Mon, Sep 9, 2013 at 10:01 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> However this means that the progress meter will now be wrong and that's
> terrible !  Users *will* complain that the meter doesn't reach 100% and
> they'll protest for being denied the remaining objects during the
> transfer !
>
> Joking aside, we should think about doing something about it.  I was
> wondering if some kind of prefix to the pack stream could be inserted
> onto the wire when sending a pack v4.  Something like:
>
> 'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>
>
> This 8-byte prefix would simply be discarded by index-pack after being
> parsed.
>
> What do you think?

I have no problem with this, although I would rather we generalize the case
to support multiple packs in the same stream (in some cases the server
can just stream away one big existing pack, followed by a smaller pack
of recent updates), where "thin" is just a special pack that is not
saved on disk. So except for the signature difference, it should at
least follow the pack header (sig, version, nr_objects).
-- 
Duy

^ permalink raw reply	[flat|nested] 124+ messages in thread

* Re: [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size
  2013-09-10  0:45         ` Duy Nguyen
@ 2013-09-12 15:34           ` Nicolas Pitre
  0 siblings, 0 replies; 124+ messages in thread
From: Nicolas Pitre @ 2013-09-12 15:34 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

On Tue, 10 Sep 2013, Duy Nguyen wrote:

> On Mon, Sep 9, 2013 at 10:01 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > However this means that the progress meter will now be wrong and that's
> > terrible !  Users *will* complain that the meter doesn't reach 100% and
> > they'll protest for being denied the remaining objects during the
> > transfer !
> >
> > Joking aside, we should think about doing something about it.  I was
> > wondering if some kind of prefix to the pack stream could be inserted
> > onto the wire when sending a pack v4.  Something like:
> >
> > 'T', 'H', 'I', 'N', <actual_number_of_sent_objects_in_network_order>
> >
> > This 8-byte prefix would simply be discarded by index-pack after being
> > parsed.
> >
> > What do you think?
> 
> I have no problem with this, although I would rather we generalize the case
> to support multiple packs in the same stream (in some cases the server
> can just stream away one big existing pack, followed by a smaller pack
> of recent updates), where "thin" is just a special pack that is not
> saved on disk. So except for the signature difference, it should at
> least follow the pack header (sig, version, nr_objects).

Except in this case this is not a separate pack.  This prefix is there 
to provide information that is valid only for the pack to follow and 
therefore cannot be considered as some independent data.


Nicolas

^ permalink raw reply	[flat|nested] 124+ messages in thread

end of thread, other threads:[~2013-09-12 15:34 UTC | newest]

Thread overview: 124+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-05  6:19 [PATCH 00/38] pack version 4 basic functionalities Nicolas Pitre
2013-09-05  6:19 ` [PATCH 01/38] pack v4: initial pack dictionary structure and code Nicolas Pitre
2013-09-05  6:19 ` [PATCH 02/38] export packed_object_info() Nicolas Pitre
2013-09-05  6:19 ` [PATCH 03/38] pack v4: scan tree objects Nicolas Pitre
2013-09-05  6:19 ` [PATCH 04/38] pack v4: add tree entry mode support to dictionary entries Nicolas Pitre
2013-09-05  6:19 ` [PATCH 05/38] pack v4: add commit object parsing Nicolas Pitre
2013-09-05 10:30   ` SZEDER Gábor
2013-09-05 17:30     ` Nicolas Pitre
2013-09-05  6:19 ` [PATCH 06/38] pack v4: split the object list and dictionary creation Nicolas Pitre
2013-09-05  6:19 ` [PATCH 07/38] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry Nicolas Pitre
2013-09-05  6:19 ` [PATCH 08/38] pack v4: basic SHA1 reference encoding Nicolas Pitre
2013-09-05  6:19 ` [PATCH 09/38] introduce get_sha1_lowhex() Nicolas Pitre
2013-09-05  6:19 ` [PATCH 10/38] pack v4: commit object encoding Nicolas Pitre
2013-09-06  6:57   ` Junio C Hamano
2013-09-06 21:28     ` Nicolas Pitre
2013-09-06 22:08       ` Junio C Hamano
2013-09-07  4:41         ` Nicolas Pitre
2013-09-05  6:19 ` [PATCH 11/38] pack v4: tree " Nicolas Pitre
2013-09-05  6:19 ` [PATCH 12/38] pack v4: dictionary table output Nicolas Pitre
2013-09-05  6:19 ` [PATCH 13/38] pack v4: creation code Nicolas Pitre
2013-09-05  6:19 ` [PATCH 14/38] pack v4: object headers Nicolas Pitre
2013-09-05  6:19 ` [PATCH 15/38] pack v4: object data copy Nicolas Pitre
2013-09-05  6:19 ` [PATCH 16/38] pack v4: object writing Nicolas Pitre
2013-09-05  6:19 ` [PATCH 17/38] pack v4: tree object delta encoding Nicolas Pitre
2013-09-05  6:19 ` [PATCH 18/38] pack v4: load delta candidate for encoding tree objects Nicolas Pitre
2013-09-05  6:19 ` [PATCH 19/38] packv4-create: optimize delta encoding Nicolas Pitre
2013-09-05  6:19 ` [PATCH 20/38] pack v4: honor pack.compression config option Nicolas Pitre
2013-09-05  6:19 ` [PATCH 21/38] pack v4: relax commit parsing a bit Nicolas Pitre
2013-09-05  6:19 ` [PATCH 22/38] pack index v3 Nicolas Pitre
2013-09-05  6:19 ` [PATCH 23/38] packv4-create: normalize pack name to properly generate the pack index file name Nicolas Pitre
2013-09-05  6:19 ` [PATCH 24/38] packv4-create: add progress display Nicolas Pitre
2013-09-05  6:19 ` [PATCH 25/38] pack v4: initial pack index v3 support on the read side Nicolas Pitre
2013-09-05  6:19 ` [PATCH 26/38] pack v4: object header decode Nicolas Pitre
2013-09-05  6:19 ` [PATCH 27/38] pack v4: code to obtain a SHA1 from a sha1ref Nicolas Pitre
2013-09-05  6:19 ` [PATCH 28/38] pack v4: code to load and prepare a pack dictionary table for use Nicolas Pitre
2013-09-05  6:19 ` [PATCH 29/38] pack v4: code to retrieve a name Nicolas Pitre
2013-09-05  6:19 ` [PATCH 30/38] pack v4: code to recreate a canonical commit object Nicolas Pitre
2013-09-05  6:19 ` [PATCH 31/38] sha1_file.c: make use of decode_varint() Nicolas Pitre
2013-09-05  7:35   ` SZEDER Gábor
2013-09-05  6:19 ` [PATCH 32/38] pack v4: parse delta base reference Nicolas Pitre
2013-09-05  6:19 ` [PATCH 33/38] pack v4: we can read commit objects now Nicolas Pitre
2013-09-05  6:19 ` [PATCH 34/38] pack v4: code to retrieve a path component Nicolas Pitre
2013-09-05  6:19 ` [PATCH 35/38] pack v4: decode tree objects Nicolas Pitre
2013-09-05  6:19 ` [PATCH 36/38] pack v4: get " Nicolas Pitre
2013-09-05  6:20 ` [PATCH 37/38] pack v4: introduce "escape hatches" in the name and path indexes Nicolas Pitre
2013-09-05 19:02   ` Nicolas Pitre
2013-09-05 21:48     ` Nicolas Pitre
2013-09-05 23:57     ` Duy Nguyen
2013-09-05  6:20 ` [PATCH 38/38] packv4-create: add a command line argument to limit tree copy sequences Nicolas Pitre
2013-09-07 10:43 ` [PATCH 00/12] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 01/12] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 02/12] index-pack: split out varint decoding code Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 03/12] index-pack: do not allocate buffer for unpacking deltas in the first pass Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 04/12] index-pack: split inflate/digest code out of unpack_entry_data Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 05/12] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
2013-09-08  2:14     ` Nicolas Pitre
2013-09-07 10:43   ` [PATCH 06/12] index-pack: make sure all objects are registered in v4's SHA-1 table Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 07/12] index-pack: parse v4 commit format Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 08/12] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
2013-09-08  2:52     ` Nicolas Pitre
2013-09-07 10:43   ` [PATCH 09/12] index-pack: move delta base queuing code to unpack_raw_entry Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 10/12] index-pack: record all delta bases in v4 (tree and ref-delta) Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 11/12] index-pack: skip looking for ofs-deltas in v4 as they are not allowed Nguyễn Thái Ngọc Duy
2013-09-07 10:43   ` [PATCH 12/12] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
2013-09-08  3:28     ` Nicolas Pitre
2013-09-08  3:44       ` Duy Nguyen
2013-09-08  7:22   ` [PATCH v2 00/14] pack v4 support in index-pack Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 01/14] pack v4: split pv4_create_dict() out of load_dict() Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 02/14] pack v4: add pv4_free_dict() Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 03/14] index-pack: add more comments on some big functions Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 04/14] index-pack: split out varint decoding code Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 05/14] index-pack: do not allocate buffer for unpacking deltas in the first pass Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 06/14] index-pack: split inflate/digest code out of unpack_entry_data Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 07/14] index-pack: parse v4 header and dictionaries Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 08/14] index-pack: make sure all objects are registered in v4's SHA-1 table Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 09/14] index-pack: parse v4 commit format Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 10/14] index-pack: parse v4 tree format Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 11/14] index-pack: move delta base queuing code to unpack_raw_entry Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 12/14] index-pack: record all delta bases in v4 (tree and ref-delta) Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 13/14] index-pack: skip looking for ofs-deltas in v4 as they are not allowed Nguyễn Thái Ngọc Duy
2013-09-08  7:22     ` [PATCH v2 14/14] index-pack: resolve v4 one-base trees Nguyễn Thái Ngọc Duy
2013-09-08 15:04 ` [PATCH 00/11] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 01/11] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 02/11] pack v4: stop using static/global variables in packv4-create.c Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 03/11] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
2013-09-08 20:56     ` Nicolas Pitre
2013-09-08 15:04   ` [PATCH 04/11] pack v4: add version argument to write_pack_header Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 05/11] pack-write.c: add pv4_encode_in_pack_object_header Nguyễn Thái Ngọc Duy
2013-09-08 20:51     ` Nicolas Pitre
2013-09-08 15:04   ` [PATCH 06/11] pack-objects: add --version to specify written pack version Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 07/11] list-objects.c: add show_tree_entry callback to traverse_commit_list Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 08/11] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
2013-09-09 10:40     ` Duy Nguyen
2013-09-09 13:07       ` Nicolas Pitre
2013-09-09 15:21         ` Junio C Hamano
2013-09-08 15:04   ` [PATCH 09/11] pack-objects: do not cache delta for v4 trees Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 10/11] pack-objects: exclude commits out of delta objects in v4 Nguyễn Thái Ngọc Duy
2013-09-08 15:04   ` [PATCH 11/11] pack-objects: support writing pack v4 Nguyễn Thái Ngọc Duy
2013-09-09 13:57   ` [PATCH v2 00/16] pack v4 support in pack-objects Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 01/16] pack v4: allocate dicts from the beginning Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 02/16] pack v4: stop using static/global variables in packv4-create.c Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 03/16] pack v4: move packv4-create.c to libgit.a Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 04/16] pack v4: add version argument to write_pack_header Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 05/16] pack_write: tighten valid object type check in encode_in_pack_object_header Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 06/16] pack-write.c: add pv4_encode_object_header Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 07/16] pack-objects: add --version to specify written pack version Nguyễn Thái Ngọc Duy
2013-09-09 13:57     ` [PATCH v2 08/16] list-objects.c: add show_tree_entry callback to traverse_commit_list Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 09/16] pack-objects: do not cache delta for v4 trees Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 10/16] pack-objects: exclude commits out of delta objects in v4 Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 11/16] pack-objects: create pack v4 tables Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 12/16] pack-objects: prepare SHA-1 table in v4 Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 13/16] pack-objects: support writing pack v4 Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 14/16] pack v4: support "end-of-pack" indicator in index-pack and pack-objects Nguyễn Thái Ngọc Duy
2013-09-09 13:58     ` [PATCH v2 15/16] index-pack: use nr_objects_final as sha1_table size Nguyễn Thái Ngọc Duy
2013-09-09 15:01       ` Nicolas Pitre
2013-09-09 18:34         ` Junio C Hamano
2013-09-09 18:46           ` Nicolas Pitre
2013-09-09 18:56             ` Junio C Hamano
2013-09-09 19:11               ` Nicolas Pitre
2013-09-09 19:30                 ` Junio C Hamano
2013-09-09 19:56                   ` Nicolas Pitre
2013-09-10  0:45         ` Duy Nguyen
2013-09-12 15:34           ` Nicolas Pitre
2013-09-09 13:58     ` [PATCH v2 16/16] index-pack: support completing thin packs v4 Nguyễn Thái Ngọc Duy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).