All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 00/23] Preliminary pack v4 support
@ 2013-08-27  4:25 Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 01/23] pack v4: initial pack dictionary structure and code Nicolas Pitre
                   ` (24 more replies)
  0 siblings, 25 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Subject says it all... at last !

This can also be fetched here:

	git://git.linaro.org/people/nico/git

Demonstration of what it does at the moment:

	http://article.gmane.org/gmane.comp.version-control.git/233038

I'd like to preserve the author time stamps as they relate to where in
the world I was when the corresponding code was written.  You'll notice
I didn't work on the code in the same order as it is now presented.

Still open question: what to do with a thin pack.  Should we really
complete it with local objects upon reception, or were we only over
paranoid at the time we imposed this rule?  Fixing a thin pack version 4
could be possible but not very eleguant.  Hence my pondering about
relaxing this rule.

Comments welcome !


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH 01/23] pack v4: initial pack dictionary structure and code
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:08   ` Junio C Hamano
  2013-08-27  4:25 ` [PATCH 02/23] export packed_object_info() Nicolas Pitre
                   ` (23 subsequent siblings)
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 packv4-create.c

diff --git a/packv4-create.c b/packv4-create.c
new file mode 100644
index 0000000..2de6d41
--- /dev/null
+++ b/packv4-create.c
@@ -0,0 +1,137 @@
+/*
+ * packv4-create.c: management of dictionary tables used in pack v4
+ *
+ * (C) Nicolas Pitre <nico@fluxnic.net>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+
+struct data_entry {
+	unsigned offset;
+	unsigned hits;
+};
+
+struct dict_table {
+	char *data;
+	unsigned ptr;
+	unsigned size;
+	struct data_entry *entry;
+	unsigned nb_entries;
+	unsigned max_entries;
+	unsigned *hash;
+	unsigned hash_size;
+};
+
+struct dict_table *create_dict_table(void)
+{
+	return xcalloc(sizeof(struct dict_table), 1);
+}
+
+void destroy_dict_table(struct dict_table *t)
+{
+	free(t->data);
+	free(t->entry);
+	free(t->hash);
+	free(t);
+}
+
+static int locate_entry(struct dict_table *t, const char *str)
+{
+	int i = 0;
+	const unsigned char *s = (const unsigned char *) str;
+
+	while (*s)
+		i = i * 111 + *s++;
+	i = (unsigned)i % t->hash_size;
+
+	while (t->hash[i]) {
+		unsigned n = t->hash[i] - 1;
+		if (!strcmp(str, t->data + t->entry[n].offset))
+			return n;
+		if (++i >= t->hash_size)
+			i = 0;
+	}
+	return -1 - i;
+}
+
+static void rehash_entries(struct dict_table *t)
+{
+	unsigned n;
+
+	t->hash_size *= 2;
+	if (t->hash_size < 1024)
+		t->hash_size = 1024;
+	t->hash = xrealloc(t->hash, t->hash_size * sizeof(*t->hash));
+	memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
+
+	for (n = 0; n < t->nb_entries; n++) {
+		int i = locate_entry(t, t->data + t->entry[n].offset);
+		if (i < 0)
+			t->hash[-1 - i] = n + 1;
+	}
+}
+
+int dict_add_entry(struct dict_table *t, const char *str)
+{
+	int i, len = strlen(str) + 1;
+
+	if (t->ptr + len >= t->size) {
+		t->size = (t->size + len + 1024) * 3 / 2;
+		t->data = xrealloc(t->data, t->size);
+	}
+	memcpy(t->data + t->ptr, str, len);
+
+	i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
+	if (i >= 0) {
+		t->entry[i].hits++;
+		return i;
+	}
+
+	if (t->nb_entries >= t->max_entries) {
+		t->max_entries = (t->max_entries + 1024) * 3 / 2;
+		t->entry = xrealloc(t->entry, t->max_entries * sizeof(*t->entry));
+	}
+	t->entry[t->nb_entries].offset = t->ptr;
+	t->entry[t->nb_entries].hits = 1;
+	t->ptr += len + 1;
+	t->nb_entries++;
+
+	if (t->hash_size * 3 <= t->nb_entries * 4)
+		rehash_entries(t);
+	else
+		t->hash[-1 - i] = t->nb_entries;
+
+	return t->nb_entries - 1;
+}
+
+static int cmp_dict_entries(const void *a_, const void *b_)
+{
+	const struct data_entry *a = a_;
+	const struct data_entry *b = b_;
+	int diff = b->hits - a->hits;
+	if (!diff)
+		diff = a->offset - b->offset;
+	return diff;
+}
+
+static void sort_dict_entries_by_hits(struct dict_table *t)
+{
+	qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
+	t->hash_size = (t->nb_entries * 4 / 3) / 2;
+	rehash_entries(t);
+}
+
+void dict_dump(struct dict_table *t)
+{
+	int i;
+
+	sort_dict_entries_by_hits(t);
+	for (i = 0; i < t->nb_entries; i++)
+		printf("%d\t%s\n",
+			t->entry[i].hits,
+			t->data + t->entry[i].offset);
+}
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 02/23] export packed_object_info()
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 01/23] pack v4: initial pack dictionary structure and code Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 03/23] pack v4: scan tree objects Nicolas Pitre
                   ` (22 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h     | 1 +
 sha1_file.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index 85b544f..b6634c4 100644
--- a/cache.h
+++ b/cache.h
@@ -1160,6 +1160,7 @@ struct object_info {
 	} u;
 };
 extern int sha1_object_info_extended(const unsigned char *, struct object_info *);
+extern int packed_object_info(struct packed_git *, off_t, struct object_info *);
 
 /* Dumb servers support */
 extern int update_server_info(int);
diff --git a/sha1_file.c b/sha1_file.c
index 8e27db1..c2020d0 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1782,8 +1782,8 @@ unwind:
 	goto out;
 }
 
-static int packed_object_info(struct packed_git *p, off_t obj_offset,
-			      struct object_info *oi)
+int packed_object_info(struct packed_git *p, off_t obj_offset,
+		       struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
 	unsigned long size;
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 03/23] pack v4: scan tree objects
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 01/23] pack v4: initial pack dictionary structure and code Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 02/23] export packed_object_info() Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 04/23] pack v4: add tree entry mode support to dictionary entries Nicolas Pitre
                   ` (21 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

From: Nicolas Pitre <nico@lenovo.(none)>

Let's read a pack to feed our dictionary with all the path strings
contained in all the tree objects.

Dump the resulting dictionary sorted by frequency to stdout.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 Makefile        |   1 +
 packv4-create.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)

diff --git a/Makefile b/Makefile
index 3588ca1..4716113 100644
--- a/Makefile
+++ b/Makefile
@@ -550,6 +550,7 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
+PROGRAM_OBJS += packv4-create.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
diff --git a/packv4-create.c b/packv4-create.c
index 2de6d41..00762a5 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -9,6 +9,8 @@
  */
 
 #include "cache.h"
+#include "object.h"
+#include "tree-walk.h"
 
 struct data_entry {
 	unsigned offset;
@@ -125,6 +127,22 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
 	rehash_entries(t);
 }
 
+static struct dict_table *tree_path_table;
+
+static int add_tree_dict_entries(void *buf, unsigned long size)
+{
+	struct tree_desc desc;
+	struct name_entry name_entry;
+
+	if (!tree_path_table)
+		tree_path_table = create_dict_table();
+
+	init_tree_desc(&desc, buf, size);
+	while (tree_entry(&desc, &name_entry))
+		dict_add_entry(tree_path_table, name_entry.path);
+	return 0;
+}
+
 void dict_dump(struct dict_table *t)
 {
 	int i;
@@ -135,3 +153,122 @@ void dict_dump(struct dict_table *t)
 			t->entry[i].hits,
 			t->data + t->entry[i].offset);
 }
+
+struct idx_entry
+{
+	off_t                offset;
+	const unsigned char *sha1;
+};
+
+static int sort_by_offset(const void *e1, const void *e2)
+{
+	const struct idx_entry *entry1 = e1;
+	const struct idx_entry *entry2 = e2;
+	if (entry1->offset < entry2->offset)
+		return -1;
+	if (entry1->offset > entry2->offset)
+		return 1;
+	return 0;
+}
+static int create_pack_dictionaries(struct packed_git *p)
+{
+	uint32_t nr_objects, i;
+	struct idx_entry *objects;
+
+	nr_objects = p->num_objects;
+	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+	objects[nr_objects].offset = p->index_size - 40;
+	for (i = 0; i < nr_objects; i++) {
+		objects[i].sha1 = nth_packed_object_sha1(p, i);
+		objects[i].offset = nth_packed_object_offset(p, i);
+	}
+	qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
+
+	for (i = 0; i < nr_objects; i++) {
+		void *data;
+		enum object_type type;
+		unsigned long size;
+		struct object_info oi = {};
+
+		oi.typep = &type;
+		oi.sizep = &size;
+		if (packed_object_info(p, objects[i].offset, &oi) < 0)
+			die("cannot get type of %s from %s",
+			    sha1_to_hex(objects[i].sha1), p->pack_name);
+
+		switch (type) {
+		case OBJ_TREE:
+			break;
+		default:
+			continue;
+		}
+		data = unpack_entry(p, objects[i].offset, &type, &size);
+		if (!data)
+			die("cannot unpack %s from %s",
+			    sha1_to_hex(objects[i].sha1), p->pack_name);
+		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
+			die("packed %s from %s is corrupt",
+			    sha1_to_hex(objects[i].sha1), p->pack_name);
+		if (add_tree_dict_entries(data, size) < 0)
+			die("can't process %s object %s",
+				typename(type), sha1_to_hex(objects[i].sha1));
+		free(data);
+	}
+	free(objects);
+
+	return 0;
+}
+
+static int process_one_pack(const char *path)
+{
+	char arg[PATH_MAX];
+	int len;
+	struct packed_git *p;
+
+	len = strlcpy(arg, path, PATH_MAX);
+	if (len >= PATH_MAX)
+		return error("name too long: %s", path);
+
+	/*
+	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
+	 * normalize these forms to "foo.idx" for add_packed_git().
+	 */
+	if (has_extension(arg, ".pack")) {
+		strcpy(arg + len - 5, ".idx");
+		len--;
+	} else if (!has_extension(arg, ".idx")) {
+		if (len + 4 >= PATH_MAX)
+			return error("name too long: %s.idx", arg);
+		strcpy(arg + len, ".idx");
+		len += 4;
+	}
+
+	/*
+	 * add_packed_git() uses our buffer (containing "foo.idx") to
+	 * build the pack filename ("foo.pack").  Make sure it fits.
+	 */
+	if (len + 1 >= PATH_MAX) {
+		arg[len - 4] = '\0';
+		return error("name too long: %s.pack", arg);
+	}
+
+	p = add_packed_git(arg, len, 1);
+	if (!p)
+		return error("packfile %s not found.", arg);
+
+	install_packed_git(p);
+	if (open_pack_index(p))
+		return error("packfile %s index not opened", p->pack_name);
+	return create_pack_dictionaries(p);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s <packfile>\n", argv[0]);
+		exit(1);
+	}
+	process_one_pack(argv[1]);
+	dict_dump(tree_path_table);
+	return 0;
+}
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 04/23] pack v4: add tree entry mode support to dictionary entries
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (2 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 03/23] pack v4: scan tree objects Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 05/23] pack v4: add commit object parsing Nicolas Pitre
                   ` (20 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Augment dict entries with a 16-bit prefix in order to store the file
mode value of tree entries.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 56 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 00762a5..eccd9fc 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -14,11 +14,12 @@
 
 struct data_entry {
 	unsigned offset;
+	unsigned size;
 	unsigned hits;
 };
 
 struct dict_table {
-	char *data;
+	unsigned char *data;
 	unsigned ptr;
 	unsigned size;
 	struct data_entry *entry;
@@ -41,18 +42,19 @@ void destroy_dict_table(struct dict_table *t)
 	free(t);
 }
 
-static int locate_entry(struct dict_table *t, const char *str)
+static int locate_entry(struct dict_table *t, const void *data, int size)
 {
-	int i = 0;
-	const unsigned char *s = (const unsigned char *) str;
+	int i = 0, len = size;
+	const unsigned char *p = data;
 
-	while (*s)
-		i = i * 111 + *s++;
+	while (len--)
+		i = i * 111 + *p++;
 	i = (unsigned)i % t->hash_size;
 
 	while (t->hash[i]) {
 		unsigned n = t->hash[i] - 1;
-		if (!strcmp(str, t->data + t->entry[n].offset))
+		if (t->entry[n].size == size &&
+		    memcmp(t->data + t->entry[n].offset, data, size) == 0)
 			return n;
 		if (++i >= t->hash_size)
 			i = 0;
@@ -71,23 +73,28 @@ static void rehash_entries(struct dict_table *t)
 	memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
 
 	for (n = 0; n < t->nb_entries; n++) {
-		int i = locate_entry(t, t->data + t->entry[n].offset);
+		int i = locate_entry(t, t->data + t->entry[n].offset,
+					t->entry[n].size);
 		if (i < 0)
 			t->hash[-1 - i] = n + 1;
 	}
 }
 
-int dict_add_entry(struct dict_table *t, const char *str)
+int dict_add_entry(struct dict_table *t, int val, const char *str)
 {
-	int i, len = strlen(str) + 1;
+	int i, val_len = 2, str_len = strlen(str) + 1;
 
-	if (t->ptr + len >= t->size) {
-		t->size = (t->size + len + 1024) * 3 / 2;
+	if (t->ptr + val_len + str_len > t->size) {
+		t->size = (t->size + val_len + str_len + 1024) * 3 / 2;
 		t->data = xrealloc(t->data, t->size);
 	}
-	memcpy(t->data + t->ptr, str, len);
 
-	i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
+	t->data[t->ptr] = val >> 8;
+	t->data[t->ptr + 1] = val;
+	memcpy(t->data + t->ptr + val_len, str, str_len);
+
+	i = (t->nb_entries) ?
+		locate_entry(t, t->data + t->ptr, val_len + str_len) : -1;
 	if (i >= 0) {
 		t->entry[i].hits++;
 		return i;
@@ -98,8 +105,9 @@ int dict_add_entry(struct dict_table *t, const char *str)
 		t->entry = xrealloc(t->entry, t->max_entries * sizeof(*t->entry));
 	}
 	t->entry[t->nb_entries].offset = t->ptr;
+	t->entry[t->nb_entries].size = val_len + str_len;
 	t->entry[t->nb_entries].hits = 1;
-	t->ptr += len + 1;
+	t->ptr += val_len + str_len;
 	t->nb_entries++;
 
 	if (t->hash_size * 3 <= t->nb_entries * 4)
@@ -139,7 +147,8 @@ static int add_tree_dict_entries(void *buf, unsigned long size)
 
 	init_tree_desc(&desc, buf, size);
 	while (tree_entry(&desc, &name_entry))
-		dict_add_entry(tree_path_table, name_entry.path);
+		dict_add_entry(tree_path_table, name_entry.mode,
+			       name_entry.path);
 	return 0;
 }
 
@@ -148,10 +157,16 @@ void dict_dump(struct dict_table *t)
 	int i;
 
 	sort_dict_entries_by_hits(t);
-	for (i = 0; i < t->nb_entries; i++)
-		printf("%d\t%s\n",
-			t->entry[i].hits,
-			t->data + t->entry[i].offset);
+	for (i = 0; i < t->nb_entries; i++) {
+		int16_t val;
+		uint16_t uval;
+		val = t->data[t->entry[i].offset] << 8;
+		val |= t->data[t->entry[i].offset + 1];
+		uval = val;
+		printf("%d\t%d\t%o\t%s\n",
+			t->entry[i].hits, val, uval,
+			t->data + t->entry[i].offset + 2);
+	}
 }
 
 struct idx_entry
@@ -170,6 +185,7 @@ static int sort_by_offset(const void *e1, const void *e2)
 		return 1;
 	return 0;
 }
+
 static int create_pack_dictionaries(struct packed_git *p)
 {
 	uint32_t nr_objects, i;
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 05/23] pack v4: add commit object parsing
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (3 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 04/23] pack v4: add tree entry mode support to dictionary entries Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:26   ` Junio C Hamano
  2013-08-27  4:25 ` [PATCH 06/23] pack v4: split the object list and dictionary creation Nicolas Pitre
                   ` (19 subsequent siblings)
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Let's create another dictionary table to hold the author and committer
entries.  We use the same table format used for tree entries where the
16 bit data prefix is conveniently used to store the timezone value.

In order to copy straight from a commit object buffer, dict_add_entry()
is modified to get the string length as the provided string pointer is
not always be null terminated.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 9 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index eccd9fc..5c08871 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -1,5 +1,5 @@
 /*
- * packv4-create.c: management of dictionary tables used in pack v4
+ * packv4-create.c: creation of dictionary tables and objects used in pack v4
  *
  * (C) Nicolas Pitre <nico@fluxnic.net>
  *
@@ -80,9 +80,9 @@ static void rehash_entries(struct dict_table *t)
 	}
 }
 
-int dict_add_entry(struct dict_table *t, int val, const char *str)
+int dict_add_entry(struct dict_table *t, int val, const char *str, int str_len)
 {
-	int i, val_len = 2, str_len = strlen(str) + 1;
+	int i, val_len = 2;
 
 	if (t->ptr + val_len + str_len > t->size) {
 		t->size = (t->size + val_len + str_len + 1024) * 3 / 2;
@@ -92,6 +92,7 @@ int dict_add_entry(struct dict_table *t, int val, const char *str)
 	t->data[t->ptr] = val >> 8;
 	t->data[t->ptr + 1] = val;
 	memcpy(t->data + t->ptr + val_len, str, str_len);
+	t->data[t->ptr + val_len + str_len] = 0;
 
 	i = (t->nb_entries) ?
 		locate_entry(t, t->data + t->ptr, val_len + str_len) : -1;
@@ -107,7 +108,7 @@ int dict_add_entry(struct dict_table *t, int val, const char *str)
 	t->entry[t->nb_entries].offset = t->ptr;
 	t->entry[t->nb_entries].size = val_len + str_len;
 	t->entry[t->nb_entries].hits = 1;
-	t->ptr += val_len + str_len;
+	t->ptr += val_len + str_len + 1;
 	t->nb_entries++;
 
 	if (t->hash_size * 3 <= t->nb_entries * 4)
@@ -135,8 +136,73 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
 	rehash_entries(t);
 }
 
+static struct dict_table *commit_name_table;
 static struct dict_table *tree_path_table;
 
+/*
+ * Parse the author/committer line from a canonical commit object.
+ * The 'from' argument points right after the "author " or "committer "
+ * string.  The time zone is parsed and stored in *tz_val.  The returned
+ * pointer is right after the end of the email address which is also just
+ * before the time value, or NULL if a parsing error is encountered.
+ */
+static char *get_nameend_and_tz(char *from, int *tz_val)
+{
+	char *end, *tz;
+
+	tz = strchr(from, '\n');
+	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
+	if (!tz || tz - from < 13)
+		return NULL;
+	tz -= 4;
+	end = tz - 4;
+	while (end - from > 5 && *end != ' ')
+		end--;
+	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
+		return NULL;
+	*tz_val = (tz[0] - '0') * 1000 +
+		  (tz[1] - '0') * 100 +
+		  (tz[2] - '0') * 10 +
+		  (tz[3] - '0');
+	switch (tz[-1]) {
+	default:	return NULL;
+	case '+':	break;
+	case '-':	*tz_val = -*tz_val;
+	}
+	return end;
+}
+
+static int add_commit_dict_entries(void *buf, unsigned long size)
+{
+	char *name, *end = NULL;
+	int tz_val;
+
+	if (!commit_name_table)
+		commit_name_table = create_dict_table();
+
+	/* parse and add author info */
+	name = strstr(buf, "\nauthor ");
+	if (name) {
+		name += 8;
+		end = get_nameend_and_tz(name, &tz_val);
+	}
+	if (!name || !end)
+		return -1;
+	dict_add_entry(commit_name_table, tz_val, name, end - name);
+
+	/* parse and add committer info */
+	name = strstr(end, "\ncommitter ");
+	if (name) {
+	       name += 11;
+	       end = get_nameend_and_tz(name, &tz_val);
+	}
+	if (!name || !end)
+		return -1;
+	dict_add_entry(commit_name_table, tz_val, name, end - name);
+
+	return 0;
+}
+
 static int add_tree_dict_entries(void *buf, unsigned long size)
 {
 	struct tree_desc desc;
@@ -146,13 +212,16 @@ static int add_tree_dict_entries(void *buf, unsigned long size)
 		tree_path_table = create_dict_table();
 
 	init_tree_desc(&desc, buf, size);
-	while (tree_entry(&desc, &name_entry))
+	while (tree_entry(&desc, &name_entry)) {
+		int pathlen = tree_entry_len(&name_entry);
 		dict_add_entry(tree_path_table, name_entry.mode,
-			       name_entry.path);
+				name_entry.path, pathlen);
+	}
+
 	return 0;
 }
 
-void dict_dump(struct dict_table *t)
+void dump_dict_table(struct dict_table *t)
 {
 	int i;
 
@@ -169,6 +238,12 @@ void dict_dump(struct dict_table *t)
 	}
 }
 
+static void dict_dump(void)
+{
+	dump_dict_table(commit_name_table);
+	dump_dict_table(tree_path_table);
+}
+
 struct idx_entry
 {
 	off_t                offset;
@@ -205,6 +280,7 @@ static int create_pack_dictionaries(struct packed_git *p)
 		enum object_type type;
 		unsigned long size;
 		struct object_info oi = {};
+		int (*add_dict_entries)(void *, unsigned long);
 
 		oi.typep = &type;
 		oi.sizep = &size;
@@ -213,7 +289,11 @@ static int create_pack_dictionaries(struct packed_git *p)
 			    sha1_to_hex(objects[i].sha1), p->pack_name);
 
 		switch (type) {
+		case OBJ_COMMIT:
+			add_dict_entries = add_commit_dict_entries;
+			break;
 		case OBJ_TREE:
+			add_dict_entries = add_tree_dict_entries;
 			break;
 		default:
 			continue;
@@ -225,7 +305,7 @@ static int create_pack_dictionaries(struct packed_git *p)
 		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
 			die("packed %s from %s is corrupt",
 			    sha1_to_hex(objects[i].sha1), p->pack_name);
-		if (add_tree_dict_entries(data, size) < 0)
+		if (add_dict_entries(data, size) < 0)
 			die("can't process %s object %s",
 				typename(type), sha1_to_hex(objects[i].sha1));
 		free(data);
@@ -285,6 +365,6 @@ int main(int argc, char *argv[])
 		exit(1);
 	}
 	process_one_pack(argv[1]);
-	dict_dump(tree_path_table);
+	dict_dump();
 	return 0;
 }
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 06/23] pack v4: split the object list and dictionary creation
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (4 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 05/23] pack v4: add commit object parsing Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 07/23] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry Nicolas Pitre
                   ` (18 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 58 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 5c08871..20d97a4 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -261,7 +261,7 @@ static int sort_by_offset(const void *e1, const void *e2)
 	return 0;
 }
 
-static int create_pack_dictionaries(struct packed_git *p)
+static struct idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	uint32_t nr_objects, i;
 	struct idx_entry *objects;
@@ -275,7 +275,15 @@ static int create_pack_dictionaries(struct packed_git *p)
 	}
 	qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
 
-	for (i = 0; i < nr_objects; i++) {
+	return objects;
+}
+
+static int create_pack_dictionaries(struct packed_git *p,
+				    struct idx_entry *objects)
+{
+	unsigned int i;
+
+	for (i = 0; i < p->num_objects; i++) {
 		void *data;
 		enum object_type type;
 		unsigned long size;
@@ -310,20 +318,21 @@ static int create_pack_dictionaries(struct packed_git *p)
 				typename(type), sha1_to_hex(objects[i].sha1));
 		free(data);
 	}
-	free(objects);
 
 	return 0;
 }
 
-static int process_one_pack(const char *path)
+static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
 	int len;
 	struct packed_git *p;
 
 	len = strlcpy(arg, path, PATH_MAX);
-	if (len >= PATH_MAX)
-		return error("name too long: %s", path);
+	if (len >= PATH_MAX) {
+		error("name too long: %s", path);
+		return NULL;
+	}
 
 	/*
 	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
@@ -333,8 +342,10 @@ static int process_one_pack(const char *path)
 		strcpy(arg + len - 5, ".idx");
 		len--;
 	} else if (!has_extension(arg, ".idx")) {
-		if (len + 4 >= PATH_MAX)
-			return error("name too long: %s.idx", arg);
+		if (len + 4 >= PATH_MAX) {
+			error("name too long: %s.idx", arg);
+			return NULL;
+		}
 		strcpy(arg + len, ".idx");
 		len += 4;
 	}
@@ -345,17 +356,36 @@ static int process_one_pack(const char *path)
 	 */
 	if (len + 1 >= PATH_MAX) {
 		arg[len - 4] = '\0';
-		return error("name too long: %s.pack", arg);
+		error("name too long: %s.pack", arg);
+		return NULL;
 	}
 
 	p = add_packed_git(arg, len, 1);
-	if (!p)
-		return error("packfile %s not found.", arg);
+	if (!p) {
+		error("packfile %s not found.", arg);
+		return NULL;
+	}
 
 	install_packed_git(p);
-	if (open_pack_index(p))
-		return error("packfile %s index not opened", p->pack_name);
-	return create_pack_dictionaries(p);
+	if (open_pack_index(p)) {
+		error("packfile %s index not opened", p->pack_name);
+		return NULL;
+	}
+
+	return p;
+}
+
+static void process_one_pack(char *src_pack)
+{
+	struct packed_git *p;
+	struct idx_entry *objs;
+
+	p = open_pack(src_pack);
+	if (!p)
+		die("unable to open source pack");
+
+	objs = get_packed_object_list(p);
+	create_pack_dictionaries(p, objs);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 07/23] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (5 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 06/23] pack v4: split the object list and dictionary creation Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 08/23] pack v4: basic references encoding Nicolas Pitre
                   ` (17 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Let's create a struct pack_idx_entry list with sorted sha1 which will
be useful later.  The offset sorted list is now a separate indirect
list.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 72 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 30 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 20d97a4..012129b 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -11,6 +11,7 @@
 #include "cache.h"
 #include "object.h"
 #include "tree-walk.h"
+#include "pack.h"
 
 struct data_entry {
 	unsigned offset;
@@ -244,46 +245,53 @@ static void dict_dump(void)
 	dump_dict_table(tree_path_table);
 }
 
-struct idx_entry
+static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
-	off_t                offset;
-	const unsigned char *sha1;
-};
+	unsigned i, nr_objects = p->num_objects;
+	struct pack_idx_entry *objects;
+
+	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+	objects[nr_objects].offset = p->pack_size - 20;
+	for (i = 0; i < nr_objects; i++) {
+		hashcpy(objects[i].sha1, nth_packed_object_sha1(p, i));
+		objects[i].offset = nth_packed_object_offset(p, i);
+	}
+
+	return objects;
+}
 
 static int sort_by_offset(const void *e1, const void *e2)
 {
-	const struct idx_entry *entry1 = e1;
-	const struct idx_entry *entry2 = e2;
-	if (entry1->offset < entry2->offset)
+	const struct pack_idx_entry * const *entry1 = e1;
+	const struct pack_idx_entry * const *entry2 = e2;
+	if ((*entry1)->offset < (*entry2)->offset)
 		return -1;
-	if (entry1->offset > entry2->offset)
+	if ((*entry1)->offset > (*entry2)->offset)
 		return 1;
 	return 0;
 }
 
-static struct idx_entry *get_packed_object_list(struct packed_git *p)
+static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
+						    unsigned nr_objects)
 {
-	uint32_t nr_objects, i;
-	struct idx_entry *objects;
+	unsigned i;
+	struct pack_idx_entry **sorted;
 
-	nr_objects = p->num_objects;
-	objects = xmalloc((nr_objects + 1) * sizeof(*objects));
-	objects[nr_objects].offset = p->index_size - 40;
-	for (i = 0; i < nr_objects; i++) {
-		objects[i].sha1 = nth_packed_object_sha1(p, i);
-		objects[i].offset = nth_packed_object_offset(p, i);
-	}
-	qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
+	sorted = xmalloc((nr_objects + 1) * sizeof(*sorted));
+	for (i = 0; i < nr_objects + 1; i++)
+		sorted[i] = &list[i];
+	qsort(sorted, nr_objects + 1, sizeof(*sorted), sort_by_offset);
 
-	return objects;
+	return sorted;
 }
 
 static int create_pack_dictionaries(struct packed_git *p,
-				    struct idx_entry *objects)
+				    struct pack_idx_entry **obj_list)
 {
 	unsigned int i;
 
 	for (i = 0; i < p->num_objects; i++) {
+		struct pack_idx_entry *obj = obj_list[i];
 		void *data;
 		enum object_type type;
 		unsigned long size;
@@ -292,9 +300,9 @@ static int create_pack_dictionaries(struct packed_git *p,
 
 		oi.typep = &type;
 		oi.sizep = &size;
-		if (packed_object_info(p, objects[i].offset, &oi) < 0)
+		if (packed_object_info(p, obj->offset, &oi) < 0)
 			die("cannot get type of %s from %s",
-			    sha1_to_hex(objects[i].sha1), p->pack_name);
+			    sha1_to_hex(obj->sha1), p->pack_name);
 
 		switch (type) {
 		case OBJ_COMMIT:
@@ -306,16 +314,16 @@ static int create_pack_dictionaries(struct packed_git *p,
 		default:
 			continue;
 		}
-		data = unpack_entry(p, objects[i].offset, &type, &size);
+		data = unpack_entry(p, obj->offset, &type, &size);
 		if (!data)
 			die("cannot unpack %s from %s",
-			    sha1_to_hex(objects[i].sha1), p->pack_name);
-		if (check_sha1_signature(objects[i].sha1, data, size, typename(type)))
+			    sha1_to_hex(obj->sha1), p->pack_name);
+		if (check_sha1_signature(obj->sha1, data, size, typename(type)))
 			die("packed %s from %s is corrupt",
-			    sha1_to_hex(objects[i].sha1), p->pack_name);
+			    sha1_to_hex(obj->sha1), p->pack_name);
 		if (add_dict_entries(data, size) < 0)
 			die("can't process %s object %s",
-				typename(type), sha1_to_hex(objects[i].sha1));
+				typename(type), sha1_to_hex(obj->sha1));
 		free(data);
 	}
 
@@ -378,14 +386,18 @@ static struct packed_git *open_pack(const char *path)
 static void process_one_pack(char *src_pack)
 {
 	struct packed_git *p;
-	struct idx_entry *objs;
+	struct pack_idx_entry *objs, **p_objs;
+	unsigned nr_objects;
 
 	p = open_pack(src_pack);
 	if (!p)
 		die("unable to open source pack");
 
+	nr_objects = p->num_objects;
 	objs = get_packed_object_list(p);
-	create_pack_dictionaries(p, objs);
+	p_objs = sort_objs_by_offset(objs, nr_objects);
+
+	create_pack_dictionaries(p, p_objs);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 08/23] pack v4: basic references encoding
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (6 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 07/23] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:29   ` Junio C Hamano
  2013-08-27  4:25 ` [PATCH 09/23] pack v4: commit object encoding Nicolas Pitre
                   ` (16 subsequent siblings)
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Add variable length number encoding.  Let's apply the same encoding
currently used for the offset to base of OBJ_OFS_DELTA objects which
is the most efficient way to do this, and apply it to everything with
pack version 4.

Also add SHA1 reference encoding.  This one is either an index into a
SHA1 table encoded using the variable length number encoding, or the
literal 20 bytes SHA1 prefixed with a 0.

The index 0 discriminates between an actual index value or the literal
SHA1.  Therefore when the index is used its value must be increased by 1.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 012129b..bf33d15 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -245,6 +245,50 @@ static void dict_dump(void)
 	dump_dict_table(tree_path_table);
 }
 
+/*
+ * Encode a numerical value with variable length into the destination buffer
+ */
+static unsigned char *add_number(unsigned char *dst, uint64_t value)
+{
+	unsigned char buf[20];
+	unsigned pos = sizeof(buf) - 1;
+	buf[pos] = value & 127;
+	while (value >>= 7)
+		buf[--pos] = 128 | (--value & 127);
+	do {
+		*dst++ = buf[pos++];
+	} while (pos < sizeof(buf));
+	return dst;
+}
+
+/*
+ * Encode an object SHA1 reference with either an object index into the
+ * pack SHA1 table incremented by 1, or the literal SHA1 value prefixed
+ * with a zero byte if the needed SHA1 is not available in the table.
+ */
+static struct pack_idx_entry *all_objs;
+static unsigned all_objs_nr;
+static unsigned char *add_sha1_ref(unsigned char *dst, const unsigned char *sha1)
+{
+	unsigned lo = 0, hi = all_objs_nr;
+
+	do {
+		unsigned mi = (lo + hi) / 2;
+		int cmp = hashcmp(all_objs[mi].sha1, sha1);
+
+		if (cmp == 0)
+			return add_number(dst, mi + 1);
+		if (cmp > 0)
+			hi = mi;
+		else
+			lo = mi+1;
+	} while (lo < hi);
+
+	*dst++ = 0;
+	hashcpy(dst, sha1);
+	return dst + 20;
+}
+
 static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	unsigned i, nr_objects = p->num_objects;
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 09/23] pack v4: commit object encoding
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (7 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 08/23] pack v4: basic references encoding Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:39   ` Junio C Hamano
  2013-09-02 20:48   ` Duy Nguyen
  2013-08-27  4:25 ` [PATCH 10/23] pack v4: tree " Nicolas Pitre
                   ` (15 subsequent siblings)
  24 siblings, 2 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

This goes as follows:

- Tree reference: either variable length encoding of the index
  into the SHA1 table or the literal SHA1 prefixed by 0 (see
  add_sha1_ref()).

- Parent count: variable length encoding of the number of parents.
  This is normally going to occupy a single byte but doesn't have to.

- List of parent references: a list of add_sha1_ref() encoded references,
  or nothing if the parent count was zero.

- Author reference: variable length encoding of an index into the author
  string dictionary table which also covers the time zone.  To make the
  overall encoding efficient, the author table is already sorted by usage
  frequency so the most used names are first and require the shortest
  index encoding.

- Author time stamp: variable length encoded.  Year 2038 ready!

- Committer reference: same as author reference.

- Committer time stamp: same as author time stamp.

The remainder of the canonical commit object content is then zlib
compressed and appended to the above.

Rationale: The most important commit object data is densely encoded while
requiring no zlib inflate processing, and all SHA1 references are most
likely to be direct indices into the pack index file requiring no SHA1
search into the pack index file.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index bf33d15..cedbbd9 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -13,6 +13,9 @@
 #include "tree-walk.h"
 #include "pack.h"
 
+
+static int pack_compression_level = Z_DEFAULT_COMPRESSION;
+
 struct data_entry {
 	unsigned offset;
 	unsigned size;
@@ -289,6 +292,122 @@ static unsigned char *add_sha1_ref(unsigned char *dst, const unsigned char *sha1
 	return dst + 20;
 }
 
+/*
+ * This converts a canonical commit object buffer into its
+ * tightly packed representation using the already populated
+ * and sorted commit_name_table dictionary.  The parsing is
+ * strict so to ensure the canonical version may always be
+ * regenerated and produce the same hash.
+ */
+void * conv_to_dict_commit(void *buffer, unsigned long *psize)
+{
+	unsigned long size = *psize;
+	char *in, *tail, *end;
+	unsigned char *out;
+	unsigned char sha1[20];
+	int nb_parents, index, tz_val;
+	unsigned long time;
+	z_stream stream;
+	int status;
+
+	/*
+	 * It is guaranteed that the output is always going to be smaller
+	 * than the input.  We could even do this conversion in place.
+	 */
+	in = buffer;
+	tail = in + size;
+	buffer = xmalloc(size);
+	out = buffer;
+
+	/* parse the "tree" line */
+	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
+		goto bad_data;
+	if (get_sha1_hex(in + 5, sha1) < 0)
+		goto bad_data;
+	in += 46;
+	out = add_sha1_ref(out, sha1);
+
+	/* count how many "parent" lines */
+	nb_parents = 0;
+	while (in + 48 < tail && !memcmp(in, "parent ", 7) && in[47] == '\n') {
+		nb_parents++;
+		in += 48;
+	}
+	out = add_number(out, nb_parents);
+
+	/* rewind and parse the "parent" lines */
+	in -= 48 * nb_parents;
+	while (nb_parents--) {
+		if (get_sha1_hex(in + 7, sha1))
+			goto bad_data;
+		out = add_sha1_ref(out, sha1);
+		in += 48;
+	}
+
+	/* parse the "author" line */
+	/* it must be at least "author x <x> 0 +0000\n" i.e. 21 chars */
+	if (in + 21 >= tail || memcmp(in, "author ", 7))
+		goto bad_data;
+	in += 7;
+	end = get_nameend_and_tz(in, &tz_val);
+	if (!end)
+		goto bad_data;
+	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
+	if (index < 0)
+		goto bad_dict;
+	out = add_number(out, index);
+	time = strtoul(end, &end, 10);
+	if (!end || end[0] != ' ' || end[6] != '\n')
+		goto bad_data;
+	out = add_number(out, time);
+	in = end + 7;
+
+	/* parse the "committer" line */
+	/* it must be at least "committer x <x> 0 +0000\n" i.e. 24 chars */
+	if (in + 24 >= tail || memcmp(in, "committer ", 7))
+		goto bad_data;
+	in += 10;
+	end = get_nameend_and_tz(in, &tz_val);
+	if (!end)
+		goto bad_data;
+	index = dict_add_entry(commit_name_table, tz_val, in, end - in);
+	if (index < 0)
+		goto bad_dict;
+	out = add_number(out, index);
+	time = strtoul(end, &end, 10);
+	if (!end || end[0] != ' ' || end[6] != '\n')
+		goto bad_data;
+	out = add_number(out, time);
+	in = end + 7;
+
+	/* finally, deflate the remaining data */
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, pack_compression_level);
+	stream.next_in = (unsigned char *)in;
+	stream.avail_in = tail - in;
+	stream.next_out = (unsigned char *)out;
+	stream.avail_out = size - (out - (unsigned char *)buffer);
+	status = deflate(&stream, Z_FINISH);
+	end = (char *)stream.next_out;
+	deflateEnd(&stream);
+	if (status != Z_STREAM_END) {
+		error("deflate error status %d", status);
+		goto bad;
+	}
+
+	*psize = end - (char *)buffer;
+	return buffer;
+
+bad_data:
+	error("bad commit data");
+	goto bad;
+bad_dict:
+	error("bad dict entry");
+bad:
+	free(buffer);
+	return NULL;
+}
+
 static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	unsigned i, nr_objects = p->num_objects;
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 10/23] pack v4: tree object encoding
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (8 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 09/23] pack v4: commit object encoding Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:44   ` Junio C Hamano
  2013-08-27  4:25 ` [PATCH 11/23] pack v4: dictionary table output Nicolas Pitre
                   ` (14 subsequent siblings)
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

This goes as follows:

- Number of tree entries: variable length encoded.

Then for each tree entry:

- Path component reference: variable length encoded index into the path
  string dictionary table which also covers the entry mode.  To make the
  overall encoding efficient, the author table is already sorted by usage
  frequency so the most used names are first and require the shortest
  index encoding.

- SHA1 reference: either variable length encoding of the index into the
  SHA1 table or the literal SHA1 prefixed by 0 (see add_sha1_ref()).

Rationale: all the tree object data is densely encoded while requiring
no zlib inflate processing, and all SHA1 references are most likely to
be direct indices into the pack index file requiring no SHA1 search.
Path filtering can be accomplished on the path index directly without
any string comparison during the tree traversal.

Still lacking is some kind of delta encoding for multiple tree objects
with only small differences between them.  But that'll come later.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index cedbbd9..f46fbd9 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -408,6 +408,69 @@ bad:
 	return NULL;
 }
 
+/*
+ * This converts a canonical tree object buffer into its
+ * tightly packed representation using the already populated
+ * and sorted tree_path_table dictionary.  The parsing is
+ * strict so to ensure the canonical version may always be
+ * regenerated and produce the same hash.
+ */
+void * conv_to_dict_tree(void *buffer, unsigned long *psize)
+{
+	unsigned long size = *psize;
+	unsigned char *in, *out, *end;
+	struct tree_desc desc;
+	struct name_entry name_entry;
+	int nb_entries;
+
+	if (!size)
+		return NULL;
+
+	/*
+	 * We can't make sure the result will always be smaller.
+	 * The smallest possible entry is "0 x\0<40 byte SHA1>"
+	 * or 44 bytes.  The output entry may have a realistic path index
+	 * encoding using up to 3 bytes, and a non indexable SHA1 meaning
+	 * 41 bytes.  And the output data already has the and nb_entries
+	 * headers.  In practice the output size will be significantly
+	 * smaller but for now let's make it simple.
+	 */
+	in = buffer;
+	out = xmalloc(size);
+	end = out + size;
+	buffer = out;
+
+	/* let's count how many entries there are */
+	init_tree_desc(&desc, in, size);
+	nb_entries = 0;
+	while (tree_entry(&desc, &name_entry))
+		nb_entries++;
+	out = add_number(out, nb_entries);
+
+	init_tree_desc(&desc, in, size);
+	while (tree_entry(&desc, &name_entry)) {
+		int pathlen = tree_entry_len(&name_entry);
+		int index = dict_add_entry(tree_path_table, name_entry.mode,
+					   name_entry.path, pathlen);
+		if (index < 0) {
+			error("missing tree dict entry");
+			free(buffer);
+			return NULL;
+		}
+		if (end - out < 45) {
+			unsigned long sofar = out - (unsigned char *)buffer;
+			buffer = xrealloc(buffer, sofar + 45);
+			out = (unsigned char *)buffer + sofar;
+			end = out + 45;
+		}
+		out = add_number(out, index);
+		out = add_sha1_ref(out, name_entry.sha1);
+	}
+
+	*psize = out - (unsigned char *)buffer;
+	return buffer;
+}
+
 static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
 {
 	unsigned i, nr_objects = p->num_objects;
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 11/23] pack v4: dictionary table output
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (9 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 10/23] pack v4: tree " Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 12/23] pack v4: creation code Nicolas Pitre
                   ` (13 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Here's the code to dump a table into a pack.  Table entries are written
according to the current sort order. This is important as objects use
this order to index into the table.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index f46fbd9..2956fda 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -556,6 +556,55 @@ static int create_pack_dictionaries(struct packed_git *p,
 	return 0;
 }
 
+static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
+{
+	unsigned char buffer[1024], *end;
+	unsigned hdrlen;
+	unsigned long size, datalen;
+	z_stream stream;
+	int i, status;
+
+	/*
+	 * Stored dict table format: uncompressed data length followed by
+	 * compressed content.
+	 */
+
+	datalen = t->ptr;
+	end = add_number(buffer, datalen);
+	hdrlen = end - buffer;
+	sha1write(f, buffer, hdrlen);
+
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, pack_compression_level);
+
+	for (i = 0; i < t->nb_entries; i++) {
+		stream.next_in = t->data + t->entry[i].offset;
+		stream.avail_in = 2 + strlen((char *)t->data + t->entry[i].offset + 2) + 1;
+		do {
+			stream.next_out = (unsigned char *)buffer;
+			stream.avail_out = sizeof(buffer);
+			status = deflate(&stream, 0);
+			size = stream.next_out - (unsigned char *)buffer;
+			sha1write(f, buffer, size);
+		} while (status == Z_OK);
+	}
+	do {
+		stream.next_out = (unsigned char *)buffer;
+		stream.avail_out = sizeof(buffer);
+		status = deflate(&stream, Z_FINISH);
+		size = stream.next_out - (unsigned char *)buffer;
+		sha1write(f, buffer, size);
+	} while (status == Z_OK);
+	if (status != Z_STREAM_END)
+		die("unable to deflate dictionary table (%d)", status);
+	if (stream.total_in != datalen)
+		die("dict data size mismatch (%ld vs %ld)",
+		    stream.total_in, datalen);
+	deflateEnd(&stream);
+
+	return hdrlen + datalen;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 12/23] pack v4: creation code
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (10 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 11/23] pack v4: dictionary table output Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:48   ` Junio C Hamano
  2013-08-27  4:25 ` [PATCH 13/23] pack v4: object headers Nicolas Pitre
                   ` (12 subsequent siblings)
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Let's actually open the destination pack file and write the header and
the tables.

The header isn't much different from pack v3, except for the pack version
number of course.

The first table is the sorted SHA1 table normally found in the pack index
file.  With pack v4 we write this table in the main pack file instead as
it is index referenced by subsequent objects in the pack.  Doing so has
many advantages:

- The SHA1 references used to be duplicated on disk: once in the pack
  index file, and then at least once or more within commit and tree
  objects referencing them.  The only SHA1 which is not being listed more
  than once this way is the one for a branch tip commit object and those
  are normally very few.  Now all that SHA1 data is represented only once.

- The SHA1 references found in commit and tree objects can be obtained
  on disk directly without having to deflate those objects first.

The SHA1 table size is obtained by multiplying the number of objects by 20.

And then the commit and path dictionary tables are written right after
the SHA1 table.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 2956fda..5211f9c 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -605,6 +605,48 @@ static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
 	return hdrlen + datalen;
 }
 
+static struct sha1file * packv4_open(char *path)
+{
+	int fd;
+
+	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
+	if (fd < 0)
+		die_errno("unable to create '%s'", path);
+	return sha1fd(fd, path);
+}
+
+static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
+{
+	struct pack_header hdr;
+
+	hdr.hdr_signature = htonl(PACK_SIGNATURE);
+	hdr.hdr_version = htonl(4);
+	hdr.hdr_entries = htonl(nr_objects);
+	sha1write(f, &hdr, sizeof(hdr));
+
+	return sizeof(hdr);
+}
+
+static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects,
+					 struct pack_idx_entry *objs)
+{
+	unsigned i;
+	unsigned long written = 0;
+
+	/* The sorted list of object SHA1's is always first */
+	for (i = 0; i < nr_objects; i++)
+		sha1write(f, objs[i].sha1, 20);
+	written = 20 * nr_objects;
+
+	/* Then the commit dictionary table */
+	written += write_dict_table(f, commit_name_table);
+
+	/* Followed by the path component dictionary table */
+	written += write_dict_table(f, tree_path_table);
+
+	return written;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
@@ -658,9 +700,10 @@ static struct packed_git *open_pack(const char *path)
 	return p;
 }
 
-static void process_one_pack(char *src_pack)
+static void process_one_pack(char *src_pack, char *dst_pack)
 {
 	struct packed_git *p;
+	struct sha1file *f;
 	struct pack_idx_entry *objs, **p_objs;
 	unsigned nr_objects;
 
@@ -673,15 +716,22 @@ static void process_one_pack(char *src_pack)
 	p_objs = sort_objs_by_offset(objs, nr_objects);
 
 	create_pack_dictionaries(p, p_objs);
+
+	f = packv4_open(dst_pack);
+	if (!f)
+		die("unable to open destination pack");
+	packv4_write_header(f, nr_objects);
+	packv4_write_tables(f, nr_objects, objs);
 }
 
 int main(int argc, char *argv[])
 {
-	if (argc != 2) {
-		fprintf(stderr, "Usage: %s <packfile>\n", argv[0]);
+	if (argc != 3) {
+		fprintf(stderr, "Usage: %s <src_packfile> <dst_packfile>\n", argv[0]);
 		exit(1);
 	}
-	process_one_pack(argv[1]);
-	dict_dump();
+	process_one_pack(argv[1], argv[2]);
+	if (0)
+		dict_dump();
 	return 0;
 }
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 13/23] pack v4: object headers
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (11 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 12/23] pack v4: creation code Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:25 ` [PATCH 14/23] pack v4: object data copy Nicolas Pitre
                   ` (11 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

In pack v4 the object size and type is encoded differently from pack v3.
The object size uses the same efficient variable length number encoding
already used elsewhere.

The object type has 4 bits allocated to it compared to 3 bits in pack v3.
This should be quite sufficient for the foreseeable future, especially
since pack v4 has only one type of delta object instead of two.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 5211f9c..6e0bb1d 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -647,6 +647,32 @@ static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects
 	return written;
 }
 
+static unsigned int write_object_header(struct sha1file *f, enum object_type type, unsigned long size)
+{
+	unsigned char buf[30], *end;
+	uint64_t val;
+
+	/*
+	 * We really have only one kind of delta object.
+	 */
+	if (type == OBJ_OFS_DELTA)
+		type = OBJ_REF_DELTA;
+
+	/*
+	 * We allocate 4 bits in the LSB for the object type which should
+	 * be good for quite a while, given that we effectively encodes
+	 * only 5 object types: commit, tree, blob, delta, tag.
+	 */
+	val = size;
+	if (MSB(val, 4))
+		die("fixme: the code doesn't currently cope with big sizes");
+	val <<= 4;
+	val |= type;
+	end = add_number(buf, val);
+	sha1write(f, buf, end - buf);
+	return end - buf;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 14/23] pack v4: object data copy
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (12 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 13/23] pack v4: object headers Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27 15:53   ` Junio C Hamano
  2013-08-27  4:25 ` [PATCH 15/23] pack v4: object writing Nicolas Pitre
                   ` (10 subsequent siblings)
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

Blob and tag objects have no particular changes except for their object
header.

Delta objects are also copied as is, except for their delta base reference
which is converted to the new way as used elsewhere in pack v4 encoding
i.e. an index into the SHA1 table or a literal SHA1 prefixed by 0 if not
found in the table (see add_sha1_ref).  This is true for both REF_DELTA
as well as OFS_DELTA.

Object payload is validated against the recorded CRC32 in the source
pack index file when possible.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 6e0bb1d..a6dc818 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -12,6 +12,7 @@
 #include "object.h"
 #include "tree-walk.h"
 #include "pack.h"
+#include "pack-revindex.h"
 
 
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
@@ -673,6 +674,71 @@ static unsigned int write_object_header(struct sha1file *f, enum object_type typ
 	return end - buf;
 }
 
+static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
+				      off_t offset)
+{
+	struct pack_window *w_curs = NULL;
+	struct revindex_entry *revidx;
+	enum object_type type;
+	unsigned long avail, size, datalen, written;
+	int hdrlen, idx_nr;
+	unsigned char *src, *end, buf[24];
+
+	revidx = find_pack_revindex(p, offset);
+	idx_nr = revidx->nr;
+	datalen = revidx[1].offset - offset;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	written = write_object_header(f, type, size);
+
+	if (type == OBJ_OFS_DELTA) {
+		unsigned char c = src[hdrlen++];
+		off_t base_offset = c & 127;
+		while (c & 128) {
+			base_offset += 1;
+			if (!base_offset || MSB(base_offset, 7))
+				die("delta offset overflow");
+			c = src[hdrlen++];
+			base_offset = (base_offset << 7) + (c & 127);
+		}
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset)
+			die("delta offset out of bound");
+		revidx = find_pack_revindex(p, base_offset);
+		end = add_sha1_ref(buf, nth_packed_object_sha1(p, revidx->nr));
+		sha1write(f, buf, end - buf);
+		written += end - buf;
+	} else if (type == OBJ_REF_DELTA) {
+		end = add_sha1_ref(buf, src + hdrlen);
+		hdrlen += 20;
+		sha1write(f, buf, end - buf);
+		written += end - buf;
+	}
+
+	if (p->index_version > 1 &&
+	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
+		die("bad CRC for object at offset %"PRIuMAX" in %s",
+		    (uintmax_t)offset, p->pack_name);
+
+	offset += hdrlen;
+	datalen -= hdrlen;
+
+	while (datalen) {
+		src = use_pack(p, &w_curs, offset, &avail);
+		if (avail > datalen)
+			avail = datalen;
+		sha1write(f, src, avail);
+		written += avail;
+		offset += avail;
+		datalen -= avail;
+	}
+	unuse_pack(&w_curs);
+
+	return written;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 15/23] pack v4: object writing
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (13 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 14/23] pack v4: object data copy Nicolas Pitre
@ 2013-08-27  4:25 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 16/23] pack v4: tree object delta encoding Nicolas Pitre
                   ` (9 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:25 UTC (permalink / raw)
  To: git

This adds the missing code to finally be able to produce a complete
pack file version 4.  We trap commit and tree objects as those have
a completely new encoding.  Other object types are copied almost
unchanged.

As we go the pack index entries are updated  in place to store the new
object offsets once they're written to the destination file.  This will
be needed later for writing the pack index file.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 3 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index a6dc818..a9e4fb3 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -739,6 +739,59 @@ static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
 	return written;
 }
 
+static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
+				 struct pack_idx_entry *obj)
+{
+	void *src, *result;
+	struct object_info oi = {};
+	enum object_type type;
+	unsigned long size;
+	unsigned int hdrlen;
+
+	oi.typep = &type;
+	oi.sizep = &size;
+	if (packed_object_info(p, obj->offset, &oi) < 0)
+		die("cannot get type of %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	/* Some objects are copied without decompression */
+	switch (type) {
+	case OBJ_COMMIT:
+	case OBJ_TREE:
+		break;
+	default:
+		return copy_object_data(f, p, obj->offset);
+	}
+
+	/* The rest is converted into their new format */
+	src = unpack_entry(p, obj->offset, &type, &size);
+	if (!src)
+		die("cannot unpack %s from %s",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+	if (check_sha1_signature(obj->sha1, src, size, typename(type)))
+		die("packed %s from %s is corrupt",
+		    sha1_to_hex(obj->sha1), p->pack_name);
+
+	switch (type) {
+	case OBJ_COMMIT:
+		result = conv_to_dict_commit(src, &size);
+		break;
+	case OBJ_TREE:
+		result = conv_to_dict_tree(src, &size);
+		break;
+	default:
+		die("unexpected object type %d", type);
+	}
+	free(src);
+	if (!result && size)
+		die("can't convert %s object %s",
+		    typename(type), sha1_to_hex(obj->sha1));
+	hdrlen = write_object_header(f, type, size);
+	sha1write(f, result, size);
+	free(result);
+	return hdrlen + size;
+}
+
 static struct packed_git *open_pack(const char *path)
 {
 	char arg[PATH_MAX];
@@ -797,7 +850,8 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	struct packed_git *p;
 	struct sha1file *f;
 	struct pack_idx_entry *objs, **p_objs;
-	unsigned nr_objects;
+	unsigned i, nr_objects;
+	off_t written = 0;
 
 	p = open_pack(src_pack);
 	if (!p)
@@ -808,12 +862,26 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	p_objs = sort_objs_by_offset(objs, nr_objects);
 
 	create_pack_dictionaries(p, p_objs);
+	sort_dict_entries_by_hits(commit_name_table);
+	sort_dict_entries_by_hits(tree_path_table);
 
 	f = packv4_open(dst_pack);
 	if (!f)
 		die("unable to open destination pack");
-	packv4_write_header(f, nr_objects);
-	packv4_write_tables(f, nr_objects, objs);
+	written += packv4_write_header(f, nr_objects);
+	written += packv4_write_tables(f, nr_objects, objs);
+
+	/* Let's write objects out, updating the object index list in place */
+	all_objs = objs;
+	all_objs_nr = nr_objects;
+	for (i = 0; i < nr_objects; i++) {
+		off_t obj_pos = written;
+		struct pack_idx_entry *obj = p_objs[i];
+		written += packv4_write_object(f, p, obj);
+		obj->offset = obj_pos;
+	}
+
+	sha1close(f, NULL, CSUM_CLOSE | CSUM_FSYNC);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 16/23] pack v4: tree object delta encoding
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (14 preceding siblings ...)
  2013-08-27  4:25 ` [PATCH 15/23] pack v4: object writing Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 17/23] pack v4: load delta candidate for encoding tree objects Nicolas Pitre
                   ` (8 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

In order to be able to quickly walk tree objects, let's encode their
"delta" as a range of entries into another tree object.

The encoding allows for the base object to change so multiple base
objects can be borrowed from.  The code doesn't try to exploit this
possibility at the moment though.

The code isn't optimal at the moment as it doesn't consider the case
where a borrow sequence could be larger than the local sequence it
means to replace.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 104 insertions(+), 11 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index a9e4fb3..744514c 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -409,24 +409,53 @@ bad:
 	return NULL;
 }
 
+static int compare_tree_entries(struct name_entry *e1, struct name_entry *e2)
+{
+	int len1 = tree_entry_len(e1);
+	int len2 = tree_entry_len(e2);
+	int len = len1 < len2 ? len1 : len2;
+	unsigned char c1, c2;
+	int cmp;
+
+	cmp = memcmp(e1->path, e2->path, len);
+	if (cmp)
+		return cmp;
+	c1 = e1->path[len];
+	c2 = e2->path[len];
+	if (!c1 && S_ISDIR(e1->mode))
+		c1 = '/';
+	if (!c2 && S_ISDIR(e2->mode))
+		c2 = '/';
+	return c1 - c2;
+}
+
 /*
  * This converts a canonical tree object buffer into its
  * tightly packed representation using the already populated
  * and sorted tree_path_table dictionary.  The parsing is
  * strict so to ensure the canonical version may always be
  * regenerated and produce the same hash.
+ *
+ * If a delta buffer is provided, we may encode multiple ranges of tree
+ * entries against that buffer.
  */
-void * conv_to_dict_tree(void *buffer, unsigned long *psize)
+void * conv_to_dict_tree(void *buffer, unsigned long *psize,
+			 void *delta, unsigned long delta_size,
+			 const unsigned char *delta_sha1)
 {
 	unsigned long size = *psize;
 	unsigned char *in, *out, *end;
-	struct tree_desc desc;
-	struct name_entry name_entry;
+	struct tree_desc desc, delta_desc;
+	struct name_entry name_entry, delta_entry;
 	int nb_entries;
+	unsigned int delta_start, delta_pos = 0, delta_match = 0, first_delta = 1;
 
 	if (!size)
 		return NULL;
 
+	if (!delta_size)
+		delta = NULL;
+
 	/*
 	 * We can't make sure the result will always be smaller.
 	 * The smallest possible entry is "0 x\0<40 byte SHA1>"
@@ -449,21 +478,85 @@ void * conv_to_dict_tree(void *buffer, unsigned long *psize)
 	out = add_number(out, nb_entries);
 
 	init_tree_desc(&desc, in, size);
+	if (delta) {
+		init_tree_desc(&delta_desc, delta, delta_size);
+		if (!tree_entry(&delta_desc, &delta_entry))
+			delta = NULL;
+	}
+
 	while (tree_entry(&desc, &name_entry)) {
-		int pathlen = tree_entry_len(&name_entry);
-		int index = dict_add_entry(tree_path_table, name_entry.mode,
-					   name_entry.path, pathlen);
-		if (index < 0) {
-			error("missing tree dict entry");
-			free(buffer);
-			return NULL;
+		int pathlen, index;
+
+		/*
+		 * Try to match entries against our delta object.
+		 */
+		if (delta) {
+			int ret;
+
+			do {
+				ret = compare_tree_entries(&name_entry, &delta_entry);
+				if (ret < 1)
+					break;
+				if (tree_entry(&delta_desc, &delta_entry))
+					delta_pos++;
+				else
+					delta = NULL;
+			} while (delta);
+
+			if (ret == 0) {
+				if (!delta_match)
+					delta_start = delta_pos;
+				delta_match++;
+				continue;
+			}
+		}
+
+		if (delta_match) {
+			/*
+			 * Let's write a sequence indicating we're copying
+			 * entries from another object:
+			 *
+			 * 0 + start_entry + nr_entries + object_ref
+			 *
+			 * However, if object_ref is the same as the
+			 * preceeding one, we can omit it and save some
+			 * more space, especially if that ends up being a
+			 * full sha1 reference.  Let's steal the LSB
+			 * of delta_start for that purpose.
+			 */
+			if (end - out < (first_delta ? 48 : 7)) {
+				unsigned long sofar = out - (unsigned char *)buffer;
+				buffer = xrealloc(buffer, sofar + 48);
+				out = (unsigned char *)buffer + sofar;
+				end = out + 48;
+			}
+
+			delta_start <<= 1;
+			delta_start |= first_delta;
+			out = add_number(out, 0);
+			out = add_number(out, delta_start);
+			out = add_number(out, delta_match);
+			if (first_delta)
+				out = add_sha1_ref(out, delta_sha1);
+			delta_match = 0;
+			first_delta = 0;
 		}
+
 		if (end - out < 45) {
 			unsigned long sofar = out - (unsigned char *)buffer;
 			buffer = xrealloc(buffer, sofar + 45);
 			out = (unsigned char *)buffer + sofar;
 			end = out + 45;
 		}
+
+		pathlen = tree_entry_len(&name_entry);
+		index = dict_add_entry(tree_path_table, name_entry.mode,
+				       name_entry.path, pathlen);
+		if (index < 0) {
+			error("missing tree dict entry");
+			free(buffer);
+			return NULL;
+		}
 		out = add_number(out, index);
 		out = add_sha1_ref(out, name_entry.sha1);
 	}
@@ -777,7 +870,7 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		result = conv_to_dict_commit(src, &size);
 		break;
 	case OBJ_TREE:
-		result = conv_to_dict_tree(src, &size);
+		result = conv_to_dict_tree(src, &size, NULL, 0, NULL);
 		break;
 	default:
 		die("unexpected object type %d", type);
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 17/23] pack v4: load delta candidate for encoding tree objects
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (15 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 16/23] pack v4: tree object delta encoding Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 18/23] pack v4: honor pack.compression config option Nicolas Pitre
                   ` (7 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

The SHA1 of the base object is retrieved and the corresponding object
is loaded in memory for conv_to_dict_tree() to look at.  Simple but
effective.  Obviously this relies on the delta matching already performed
during the pack v3 delta search.  Some native delta search for pack v4
could be investigated eventually.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 744514c..ed67eb6 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -832,18 +832,62 @@ static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
 	return written;
 }
 
+static unsigned char *get_delta_base(struct packed_git *p, off_t offset,
+				     unsigned char *sha1_buf)
+{
+	struct pack_window *w_curs = NULL;
+	enum object_type type;
+	unsigned long avail, size;
+	int hdrlen;
+	unsigned char *src;
+	const unsigned char *base_sha1 = NULL; ;
+
+	src = use_pack(p, &w_curs, offset, &avail);
+	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
+
+	if (type == OBJ_OFS_DELTA) {
+		struct revindex_entry *revidx;
+		unsigned char c = src[hdrlen++];
+		off_t base_offset = c & 127;
+		while (c & 128) {
+			base_offset += 1;
+			if (!base_offset || MSB(base_offset, 7))
+				error("delta offset overflow");
+			c = src[hdrlen++];
+			base_offset = (base_offset << 7) + (c & 127);
+		}
+		base_offset = offset - base_offset;
+		if (base_offset <= 0 || base_offset >= offset)
+			error("delta offset out of bound");
+		revidx = find_pack_revindex(p, base_offset);
+		base_sha1 = nth_packed_object_sha1(p, revidx->nr);
+	} else if (type == OBJ_REF_DELTA) {
+		base_sha1 = src + hdrlen;
+	} else {
+		error("expected to get a delta but got a %s", typename(type));
+	}
+
+	unuse_pack(&w_curs);
+
+	if (!base_sha1)
+		return NULL;
+	hashcpy(sha1_buf, base_sha1);
+	return sha1_buf;
+}
+
 static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 				 struct pack_idx_entry *obj)
 {
 	void *src, *result;
 	struct object_info oi = {};
-	enum object_type type;
+	enum object_type type, packed_type;
 	unsigned long size;
 	unsigned int hdrlen;
 
 	oi.typep = &type;
 	oi.sizep = &size;
-	if (packed_object_info(p, obj->offset, &oi) < 0)
+	packed_type = packed_object_info(p, obj->offset, &oi);
+	if (packed_type < 0)
 		die("cannot get type of %s from %s",
 		    sha1_to_hex(obj->sha1), p->pack_name);
 
@@ -870,7 +914,26 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		result = conv_to_dict_commit(src, &size);
 		break;
 	case OBJ_TREE:
-		result = conv_to_dict_tree(src, &size, NULL, 0, NULL);
+		if (packed_type != OBJ_TREE) {
+			unsigned char sha1_buf[20], *ref_sha1;
+			void *ref;
+			enum object_type ref_type;
+			unsigned long ref_size;
+
+			ref_sha1 = get_delta_base(p, obj->offset, sha1_buf);
+			if (!ref_sha1)
+				die("unable to get delta base sha1 for %s",
+						sha1_to_hex(obj->sha1));
+			ref = read_sha1_file(ref_sha1, &ref_type, &ref_size);
+			if (!ref || ref_type != OBJ_TREE)
+				die("cannot obtain delta base for %s",
+						sha1_to_hex(obj->sha1));
+			result = conv_to_dict_tree(src, &size,
+					ref, ref_size, ref_sha1);
+			free(ref);
+		} else {
+			result = conv_to_dict_tree(src, &size, NULL, 0, NULL);
+		}
 		break;
 	default:
 		die("unexpected object type %d", type);
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 18/23] pack v4: honor pack.compression config option
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (16 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 17/23] pack v4: load delta candidate for encoding tree objects Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 19/23] pack v4: relax commit parsing a bit Nicolas Pitre
                   ` (6 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index ed67eb6..2d46d11 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -15,6 +15,7 @@
 #include "pack-revindex.h"
 
 
+static int pack_compression_seen;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 
 struct data_entry {
@@ -1040,12 +1041,30 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	sha1close(f, NULL, CSUM_CLOSE | CSUM_FSYNC);
 }
 
+static int git_pack_config(const char *k, const char *v, void *cb)
+{
+	if (!strcmp(k, "pack.compression")) {
+		int level = git_config_int(k, v);
+		if (level == -1)
+			level = Z_DEFAULT_COMPRESSION;
+		else if (level < 0 || level > Z_BEST_COMPRESSION)
+			die("bad pack compression level %d", level);
+		pack_compression_level = level;
+		pack_compression_seen = 1;
+		return 0;
+	}
+	return git_default_config(k, v, cb);
+}
+
 int main(int argc, char *argv[])
 {
 	if (argc != 3) {
 		fprintf(stderr, "Usage: %s <src_packfile> <dst_packfile>\n", argv[0]);
 		exit(1);
 	}
+	git_config(git_pack_config, NULL);
+	if (!pack_compression_seen && core_compression_seen)
+		pack_compression_level = core_compression_level;
 	process_one_pack(argv[1], argv[2]);
 	if (0)
 		dict_dump();
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 19/23] pack v4: relax commit parsing a bit
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (17 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 18/23] pack v4: honor pack.compression config option Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 20/23] pack index v3 Nicolas Pitre
                   ` (5 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

At least commit af25e94d4dcfb9608846242fabdd4e6014e5c9f0 in the Linux
kernel repository has "author  <> 1120285620 -0700"

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index 2d46d11..63dc3d2 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -157,12 +157,12 @@ static char *get_nameend_and_tz(char *from, int *tz_val)
 	char *end, *tz;
 
 	tz = strchr(from, '\n');
-	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
-	if (!tz || tz - from < 13)
+	/* let's assume the smallest possible string to be " <> 0 +0000\n" */
+	if (!tz || tz - from < 11)
 		return NULL;
 	tz -= 4;
 	end = tz - 4;
-	while (end - from > 5 && *end != ' ')
+	while (end - from > 3 && *end != ' ')
 		end--;
 	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
 		return NULL;
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 20/23] pack index v3
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (18 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 19/23] pack v4: relax commit parsing a bit Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 21/23] pack v4: normalize pack name to properly generate the pack index file name Nicolas Pitre
                   ` (4 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

This is a minor change over pack index v2.  Since pack v4 already contains
the sorted SHA1 table, it is therefore ommitted from the index file.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 pack-write.c    |  6 +++++-
 packv4-create.c | 10 +++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/pack-write.c b/pack-write.c
index ca9e63b..631007e 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -87,6 +87,8 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 
 	/* if last object's offset is >= 2^31 we should use index V2 */
 	index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
+	if (index_version < opts->version)
+		index_version = opts->version;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -127,7 +129,9 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 			uint32_t offset = htonl(obj->offset);
 			sha1write(f, &offset, 4);
 		}
-		sha1write(f, obj->sha1, 20);
+		/* Pack v4 (using index v3) carries the SHA1 table already */
+		if (index_version < 3)
+			sha1write(f, obj->sha1, 20);
 		git_SHA1_Update(&ctx, obj->sha1, 20);
 		if ((opts->flags & WRITE_IDX_STRICT) &&
 		    (i && !hashcmp(list[-2]->sha1, obj->sha1)))
diff --git a/packv4-create.c b/packv4-create.c
index 63dc3d2..bb171c5 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -1007,8 +1007,10 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	struct packed_git *p;
 	struct sha1file *f;
 	struct pack_idx_entry *objs, **p_objs;
+	struct pack_idx_option idx_opts;
 	unsigned i, nr_objects;
 	off_t written = 0;
+	unsigned char pack_sha1[20];
 
 	p = open_pack(src_pack);
 	if (!p)
@@ -1034,11 +1036,17 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	for (i = 0; i < nr_objects; i++) {
 		off_t obj_pos = written;
 		struct pack_idx_entry *obj = p_objs[i];
+		crc32_begin(f);
 		written += packv4_write_object(f, p, obj);
 		obj->offset = obj_pos;
+		obj->crc32 = crc32_end(f);
 	}
 
-	sha1close(f, NULL, CSUM_CLOSE | CSUM_FSYNC);
+	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
+
+	reset_pack_idx_option(&idx_opts);
+	idx_opts.version = 3;
+	write_idx_file(dst_pack, p_objs, nr_objects, &idx_opts, pack_sha1);
 }
 
 static int git_pack_config(const char *k, const char *v, void *cb)
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 21/23] pack v4: normalize pack name to properly generate the pack index file name
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (19 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 20/23] pack index v3 Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 22/23] pack v4: add progress display Nicolas Pitre
                   ` (3 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 73 +++++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 39 deletions(-)

diff --git a/packv4-create.c b/packv4-create.c
index bb171c5..6801e21 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -949,56 +949,46 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 	return hdrlen + size;
 }
 
-static struct packed_git *open_pack(const char *path)
+static char *normalize_pack_name(const char *path)
 {
-	char arg[PATH_MAX];
+	char buf[PATH_MAX];
 	int len;
-	struct packed_git *p;
 
-	len = strlcpy(arg, path, PATH_MAX);
-	if (len >= PATH_MAX) {
-		error("name too long: %s", path);
-		return NULL;
-	}
+	len = strlcpy(buf, path, PATH_MAX);
+	if (len >= PATH_MAX - 6)
+		die("name too long: %s", path);
 
 	/*
 	 * In addition to "foo.idx" we accept "foo.pack" and "foo";
-	 * normalize these forms to "foo.idx" for add_packed_git().
+	 * normalize these forms to "foo.pack".
 	 */
-	if (has_extension(arg, ".pack")) {
-		strcpy(arg + len - 5, ".idx");
-		len--;
-	} else if (!has_extension(arg, ".idx")) {
-		if (len + 4 >= PATH_MAX) {
-			error("name too long: %s.idx", arg);
-			return NULL;
-		}
-		strcpy(arg + len, ".idx");
-		len += 4;
+	if (has_extension(buf, ".idx")) {
+		strcpy(buf + len - 4, ".pack");
+		len++;
+	} else if (!has_extension(buf, ".pack")) {
+		strcpy(buf + len, ".pack");
+		len += 5;
 	}
 
-	/*
-	 * add_packed_git() uses our buffer (containing "foo.idx") to
-	 * build the pack filename ("foo.pack").  Make sure it fits.
-	 */
-	if (len + 1 >= PATH_MAX) {
-		arg[len - 4] = '\0';
-		error("name too long: %s.pack", arg);
-		return NULL;
-	}
+	return xstrdup(buf);
+}
 
-	p = add_packed_git(arg, len, 1);
-	if (!p) {
-		error("packfile %s not found.", arg);
-		return NULL;
-	}
+static struct packed_git *open_pack(const char *path)
+{
+	char *packname = normalize_pack_name(path);
+	int len = strlen(packname);
+	struct packed_git *p;
+
+	strcpy(packname + len - 5, ".idx");
+	p = add_packed_git(packname, len - 1, 1);
+	if (!p)
+		die("packfile %s not found.", packname);
 
 	install_packed_git(p);
-	if (open_pack_index(p)) {
-		error("packfile %s index not opened", p->pack_name);
-		return NULL;
-	}
+	if (open_pack_index(p))
+		die("packfile %s index not opened", p->pack_name);
 
+	free(packname);
 	return p;
 }
 
@@ -1010,6 +1000,7 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	struct pack_idx_option idx_opts;
 	unsigned i, nr_objects;
 	off_t written = 0;
+	char *packname;
 	unsigned char pack_sha1[20];
 
 	p = open_pack(src_pack);
@@ -1024,7 +1015,8 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	sort_dict_entries_by_hits(commit_name_table);
 	sort_dict_entries_by_hits(tree_path_table);
 
-	f = packv4_open(dst_pack);
+	packname = normalize_pack_name(dst_pack);
+	f = packv4_open(packname);
 	if (!f)
 		die("unable to open destination pack");
 	written += packv4_write_header(f, nr_objects);
@@ -1046,7 +1038,10 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 
 	reset_pack_idx_option(&idx_opts);
 	idx_opts.version = 3;
-	write_idx_file(dst_pack, p_objs, nr_objects, &idx_opts, pack_sha1);
+	strcpy(packname + strlen(packname) - 5, ".idx");
+	write_idx_file(packname, p_objs, nr_objects, &idx_opts, pack_sha1);
+
+	free(packname);
 }
 
 static int git_pack_config(const char *k, const char *v, void *cb)
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 22/23] pack v4: add progress display
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (20 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 21/23] pack v4: normalize pack name to properly generate the pack index file name Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-27  4:26 ` [PATCH 23/23] initial pack index v3 support on the read side Nicolas Pitre
                   ` (2 subsequent siblings)
  24 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 packv4-create.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/packv4-create.c b/packv4-create.c
index 6801e21..22e14da 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -13,6 +13,7 @@
 #include "tree-walk.h"
 #include "pack.h"
 #include "pack-revindex.h"
+#include "progress.h"
 
 
 static int pack_compression_seen;
@@ -609,8 +610,10 @@ static struct pack_idx_entry **sort_objs_by_offset(struct pack_idx_entry *list,
 static int create_pack_dictionaries(struct packed_git *p,
 				    struct pack_idx_entry **obj_list)
 {
+	struct progress *progress_state;
 	unsigned int i;
 
+	progress_state = start_progress("Scanning objects", p->num_objects);
 	for (i = 0; i < p->num_objects; i++) {
 		struct pack_idx_entry *obj = obj_list[i];
 		void *data;
@@ -619,6 +622,8 @@ static int create_pack_dictionaries(struct packed_git *p,
 		struct object_info oi = {};
 		int (*add_dict_entries)(void *, unsigned long);
 
+		display_progress(progress_state, i+1);
+
 		oi.typep = &type;
 		oi.sizep = &size;
 		if (packed_object_info(p, obj->offset, &oi) < 0)
@@ -648,6 +653,7 @@ static int create_pack_dictionaries(struct packed_git *p,
 		free(data);
 	}
 
+	stop_progress(&progress_state);
 	return 0;
 }
 
@@ -1002,6 +1008,7 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	off_t written = 0;
 	char *packname;
 	unsigned char pack_sha1[20];
+	struct progress *progress_state;
 
 	p = open_pack(src_pack);
 	if (!p)
@@ -1023,6 +1030,7 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 	written += packv4_write_tables(f, nr_objects, objs);
 
 	/* Let's write objects out, updating the object index list in place */
+	progress_state = start_progress("Writing objects", nr_objects);
 	all_objs = objs;
 	all_objs_nr = nr_objects;
 	for (i = 0; i < nr_objects; i++) {
@@ -1032,7 +1040,9 @@ static void process_one_pack(char *src_pack, char *dst_pack)
 		written += packv4_write_object(f, p, obj);
 		obj->offset = obj_pos;
 		obj->crc32 = crc32_end(f);
+		display_progress(progress_state, i+1);
 	}
+	stop_progress(&progress_state);
 
 	sha1close(f, pack_sha1, CSUM_CLOSE | CSUM_FSYNC);
 
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH 23/23] initial pack index v3 support on the read side
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (21 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 22/23] pack v4: add progress display Nicolas Pitre
@ 2013-08-27  4:26 ` Nicolas Pitre
  2013-08-31 11:45   ` Duy Nguyen
  2013-08-27 11:17 ` [PATCH] Document pack v4 format Nguyễn Thái Ngọc Duy
  2013-08-27 15:03 ` [PATCH 00/23] Preliminary pack v4 support Junio C Hamano
  24 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27  4:26 UTC (permalink / raw)
  To: git

A bit crud but good enough for now.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 cache.h     |  1 +
 sha1_file.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/cache.h b/cache.h
index b6634c4..63066a1 100644
--- a/cache.h
+++ b/cache.h
@@ -1018,6 +1018,7 @@ extern struct packed_git {
 	off_t pack_size;
 	const void *index_data;
 	size_t index_size;
+	const unsigned char *sha1_table;
 	uint32_t num_objects;
 	uint32_t num_bad_objects;
 	unsigned char *bad_object_sha1;
diff --git a/sha1_file.c b/sha1_file.c
index c2020d0..e9c54f4 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -504,7 +504,7 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 	hdr = idx_map;
 	if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) {
 		version = ntohl(hdr->idx_version);
-		if (version < 2 || version > 2) {
+		if (version < 2 || version > 3) {
 			munmap(idx_map, idx_size);
 			return error("index file %s is version %"PRIu32
 				     " and is not supported by this binary"
@@ -539,12 +539,13 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 			munmap(idx_map, idx_size);
 			return error("wrong index v1 file size in %s", path);
 		}
-	} else if (version == 2) {
+	} else if (version == 2 || version == 3) {
+		unsigned long min_size, max_size;
 		/*
 		 * Minimum size:
 		 *  - 8 bytes of header
 		 *  - 256 index entries 4 bytes each
-		 *  - 20-byte sha1 entry * nr
+		 *  - 20-byte sha1 entry * nr (version 2 only)
 		 *  - 4-byte crc entry * nr
 		 *  - 4-byte offset entry * nr
 		 *  - 20-byte SHA1 of the packfile
@@ -553,8 +554,10 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 		 * variable sized table containing 8-byte entries
 		 * for offsets larger than 2^31.
 		 */
-		unsigned long min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
-		unsigned long max_size = min_size;
+		min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
+		if (version != 2)
+			min_size -= nr*20;
+		max_size = min_size;
 		if (nr)
 			max_size += (nr - 1)*8;
 		if (idx_size < min_size || idx_size > max_size) {
@@ -573,6 +576,36 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
 		}
 	}
 
+	if (version >= 3) {
+		/* the SHA1 table is located in the main pack file */
+		void *pack_map;
+		struct pack_header *pack_hdr;
+
+		fd = git_open_noatime(p->pack_name);
+		if (fd < 0) {
+			munmap(idx_map, idx_size);
+			return error("unable to open %s", p->pack_name);
+		}
+		if (fstat(fd, &st) != 0 || xsize_t(st.st_size) < 12 + nr*20) {
+			close(fd);
+			munmap(idx_map, idx_size);
+			return error("size of %s is wrong", p->pack_name);
+		}
+		pack_map = xmmap(NULL, 12 + nr*20, PROT_READ, MAP_PRIVATE, fd, 0);
+		close(fd);
+		pack_hdr = pack_map;
+		if (pack_hdr->hdr_signature != htonl(PACK_SIGNATURE) ||
+		    pack_hdr->hdr_version != htonl(4) ||
+		    pack_hdr->hdr_entries != htonl(nr)) {
+			munmap(idx_map, idx_size);
+			munmap(pack_map, 12 + nr*20);
+			return error("packfile for %s doesn't match expectations", path);
+		}
+		p->sha1_table = pack_map;
+		p->sha1_table += 12;
+	} else
+		p->sha1_table = NULL;
+
 	p->index_version = version;
 	p->index_data = idx_map;
 	p->index_size = idx_size;
@@ -697,6 +730,10 @@ void close_pack_index(struct packed_git *p)
 		munmap((void *)p->index_data, p->index_size);
 		p->index_data = NULL;
 	}
+	if (p->sha1_table) {
+		munmap((void *)(p->sha1_table - 12), 12 + p->num_objects * 20);
+		p->sha1_table = NULL;
+	}
 }
 
 /*
@@ -808,7 +845,7 @@ static int open_packed_git_1(struct packed_git *p)
 		return error("file %s is far too short to be a packfile", p->pack_name);
 	if (hdr.hdr_signature != htonl(PACK_SIGNATURE))
 		return error("file %s is not a GIT packfile", p->pack_name);
-	if (!pack_version_ok(hdr.hdr_version))
+	if (!pack_version_ok(hdr.hdr_version) && hdr.hdr_version != htonl(4))
 		return error("packfile %s is version %"PRIu32" and not"
 			" supported (try upgrading GIT to a newer version)",
 			p->pack_name, ntohl(hdr.hdr_version));
@@ -2226,9 +2263,12 @@ const unsigned char *nth_packed_object_sha1(struct packed_git *p,
 	index += 4 * 256;
 	if (p->index_version == 1) {
 		return index + 24 * n + 4;
-	} else {
+	} else if (p->index_version == 2) {
 		index += 8;
 		return index + 20 * n;
+	} else {
+		index = p->sha1_table;
+		return index + 20 * n;
 	}
 }
 
@@ -2241,6 +2281,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
 	} else {
 		uint32_t off;
 		index += 8 + p->num_objects * (20 + 4);
+		if (p->index_version != 2)
+			index -= p->num_objects * 20;
 		off = ntohl(*((uint32_t *)(index + 4 * n)));
 		if (!(off & 0x80000000))
 			return off;
@@ -2281,6 +2323,8 @@ off_t find_pack_entry_one(const unsigned char *sha1,
 		stride = 24;
 		index += 4;
 	}
+	if (p->index_version > 2)
+		index = p->sha1_table;
 
 	if (debug_lookup)
 		printf("%02x%02x%02x... lo %u hi %u nr %"PRIu32"\n",
-- 
1.8.4.22.g54757b7

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH] Document pack v4 format
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (22 preceding siblings ...)
  2013-08-27  4:26 ` [PATCH 23/23] initial pack index v3 support on the read side Nicolas Pitre
@ 2013-08-27 11:17 ` Nguyễn Thái Ngọc Duy
  2013-08-27 18:25   ` Junio C Hamano
                     ` (2 more replies)
  2013-08-27 15:03 ` [PATCH 00/23] Preliminary pack v4 support Junio C Hamano
  24 siblings, 3 replies; 83+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-08-27 11:17 UTC (permalink / raw)
  To: git; +Cc: Nicolas Pitre, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 For my education but may help people who are interested in the
 format. Most is gathered from commit messages, except the delta tree
 entries.
 
 .idx is not documented yet, but it does not change much and not the
 focus right now anyway.

 Documentation/technical/pack-format-v4.txt (new) | 110 +++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 Documentation/technical/pack-format-v4.txt

diff --git a/Documentation/technical/pack-format-v4.txt b/Documentation/technical/pack-format-v4.txt
new file mode 100644
index 0000000..9123a53
--- /dev/null
+++ b/Documentation/technical/pack-format-v4.txt
@@ -0,0 +1,110 @@
+Git pack v4 format
+==================
+
+== pack-*.pack files have the following format:
+
+   - A header appears at the beginning and consists of the following:
+
+     4-byte signature:
+	  The signature is: {'P', 'A', 'C', 'K'}
+
+     4-byte version number (network byte order): must be version
+     number 4
+
+     4-byte number of objects contained in the pack (network byte
+     order)
+
+   - (20 * nr_objects)-byte SHA-1 table: sorted in memcmp() order.
+
+   - Commit name dictionary: the uncompressed length in variable
+     encoding, followed by zlib-compressed dictionary. Each entry
+     consists of two prefix bytes storing timezone followed by a
+     NUL-terminated string.
+
+     Entries should be sorted by frequency so that the most frequent
+     entry has the smallest index, thus most efficient variable
+     encoding.
+
+   - Tree path dictionary: similar format to commit name
+     dictionary. Each entry consists of two prefix bytes storing entry
+     mode, then a NUL-terminated path name. Same sort order
+     recommendation applies.
+
+   - The header is followed by number of object entries, each of
+     which looks like this:
+
+     (undeltified representation)
+     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+     [uncompressed data]
+     [compressed data]
+
+     (deltified representation)
+     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+     base object name in SHA-1 reference encoding
+     compressed delta data
+
+     In undeltified format, blobs and tags do not have the
+     uncompressed data, all object content is compressed. Trees are
+     not compressed at all. Some headers in commits are stored
+     uncompressed, the rest is compressed.
+
+     All objects except trees are deltified and compressed the same
+     way in v3. Trees however are deltified differently and use
+     undeltified representation. See "Tree representation" below for
+     details.
+
+  - The trailer records 20-byte SHA-1 checksum of all of the above.
+
+=== Commit representation
+
+  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+
+  - Tree SHA-1 in SHA-1 reference encoding
+
+  - Parent count in variable length encoding
+
+  - Parent SHA-1s in SHA-1 reference encoding
+
+  - Author reference: the index, in variable length encoding, to comit
+    name dictionary, which covers the name and also the time zone.
+
+  - Author timestamp in variable length encoding
+
+  - Committer reference: the index, in variable length encoding, to
+    comit name dictionary, which covers the name and also the time
+    zone.
+
+  - Committer timestamp in variable length encoding
+
+  - Compressed data of remaining header and the body
+
+=== Tree representation
+
+  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+
+  - Number of trees in variable length encoding
+
+  - A number of trees, each consists of
+
+    Path component reference: an index, in variable length encoding,
+    into tree path dictionary, which also covers entry mode.
+
+    SHA-1 in SHA-1 reference encoding.
+
+Path component reference zero is an indicator of deltified portion and
+has the following format:
+
+  - path component reference: zero
+
+  - index of the entry to copy from, in variable length encoding
+
+  - number of entries in variable length encoding
+
+  - base tree in SHA-1 reference encoding
+
+=== SHA-1 reference encoding
+
+This encoding is used to encode SHA-1 efficiently if it's already in
+the SHA-1 table. It starts with an index number in variable length
+encoding. If it's not zero, its value minus one is the index in the
+SHA-1 table. If it's zero, 20 bytes of SHA-1 is followed.
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH 00/23] Preliminary pack v4 support
  2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
                   ` (23 preceding siblings ...)
  2013-08-27 11:17 ` [PATCH] Document pack v4 format Nguyễn Thái Ngọc Duy
@ 2013-08-27 15:03 ` Junio C Hamano
  2013-08-27 15:59   ` Nicolas Pitre
  24 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:03 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Subject says it all... at last !
>
> This can also be fetched here:
>
> 	git://git.linaro.org/people/nico/git
>
> Demonstration of what it does at the moment:
>
> 	http://article.gmane.org/gmane.comp.version-control.git/233038
>
> I'd like to preserve the author time stamps as they relate to where in
> the world I was when the corresponding code was written.  You'll notice
> I didn't work on the code in the same order as it is now presented.

We can also notice things like "From: user@machine.(none)" ;-)

> Still open question: what to do with a thin pack.  Should we really
> complete it with local objects upon reception, or were we only over
> paranoid at the time we imposed this rule?

I do not think paranoia had much to do with it.  I am afraid that
allowing a delta in a pack to depend on a base in another pack means
that the former pack becomes unusable without the latter, which
would make object store management (e.g. partial repacking) a lot
more cumbersome, no?

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 01/23] pack v4: initial pack dictionary structure and code
  2013-08-27  4:25 ` [PATCH 01/23] pack v4: initial pack dictionary structure and code Nicolas Pitre
@ 2013-08-27 15:08   ` Junio C Hamano
  2013-08-27 16:13     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:08 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---

Was there a reason not to reuse the hash-table Linus did in
hash.[ch]?

It may not make much of a difference for something so small and
isolated from the rest of the system, but if hash.[ch] can be easily
fixed with a small tweak to suit the use by this subsystem better,
it might be worth reusing the existing code with improvement, which
may help other potential users.

>  packv4-create.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 137 insertions(+)
>  create mode 100644 packv4-create.c
>
> diff --git a/packv4-create.c b/packv4-create.c
> new file mode 100644
> index 0000000..2de6d41
> --- /dev/null
> +++ b/packv4-create.c
> @@ -0,0 +1,137 @@
> +/*
> + * packv4-create.c: management of dictionary tables used in pack v4
> + *
> + * (C) Nicolas Pitre <nico@fluxnic.net>
> + *
> + * This code is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include "cache.h"
> +
> +struct data_entry {
> +	unsigned offset;
> +	unsigned hits;
> +};
> +
> +struct dict_table {
> +	char *data;
> +	unsigned ptr;
> +	unsigned size;
> +	struct data_entry *entry;
> +	unsigned nb_entries;
> +	unsigned max_entries;
> +	unsigned *hash;
> +	unsigned hash_size;
> +};
> +
> +struct dict_table *create_dict_table(void)
> +{
> +	return xcalloc(sizeof(struct dict_table), 1);
> +}
> +
> +void destroy_dict_table(struct dict_table *t)
> +{
> +	free(t->data);
> +	free(t->entry);
> +	free(t->hash);
> +	free(t);
> +}
> +
> +static int locate_entry(struct dict_table *t, const char *str)
> +{
> +	int i = 0;
> +	const unsigned char *s = (const unsigned char *) str;
> +
> +	while (*s)
> +		i = i * 111 + *s++;
> +	i = (unsigned)i % t->hash_size;
> +
> +	while (t->hash[i]) {
> +		unsigned n = t->hash[i] - 1;
> +		if (!strcmp(str, t->data + t->entry[n].offset))
> +			return n;
> +		if (++i >= t->hash_size)
> +			i = 0;
> +	}
> +	return -1 - i;
> +}
> +
> +static void rehash_entries(struct dict_table *t)
> +{
> +	unsigned n;
> +
> +	t->hash_size *= 2;
> +	if (t->hash_size < 1024)
> +		t->hash_size = 1024;
> +	t->hash = xrealloc(t->hash, t->hash_size * sizeof(*t->hash));
> +	memset(t->hash, 0, t->hash_size * sizeof(*t->hash));
> +
> +	for (n = 0; n < t->nb_entries; n++) {
> +		int i = locate_entry(t, t->data + t->entry[n].offset);
> +		if (i < 0)
> +			t->hash[-1 - i] = n + 1;
> +	}
> +}
> +
> +int dict_add_entry(struct dict_table *t, const char *str)
> +{
> +	int i, len = strlen(str) + 1;
> +
> +	if (t->ptr + len >= t->size) {
> +		t->size = (t->size + len + 1024) * 3 / 2;
> +		t->data = xrealloc(t->data, t->size);
> +	}
> +	memcpy(t->data + t->ptr, str, len);
> +
> +	i = (t->nb_entries) ? locate_entry(t, t->data + t->ptr) : -1;
> +	if (i >= 0) {
> +		t->entry[i].hits++;
> +		return i;
> +	}
> +
> +	if (t->nb_entries >= t->max_entries) {
> +		t->max_entries = (t->max_entries + 1024) * 3 / 2;
> +		t->entry = xrealloc(t->entry, t->max_entries * sizeof(*t->entry));
> +	}
> +	t->entry[t->nb_entries].offset = t->ptr;
> +	t->entry[t->nb_entries].hits = 1;
> +	t->ptr += len + 1;
> +	t->nb_entries++;
> +
> +	if (t->hash_size * 3 <= t->nb_entries * 4)
> +		rehash_entries(t);
> +	else
> +		t->hash[-1 - i] = t->nb_entries;
> +
> +	return t->nb_entries - 1;
> +}
> +
> +static int cmp_dict_entries(const void *a_, const void *b_)
> +{
> +	const struct data_entry *a = a_;
> +	const struct data_entry *b = b_;
> +	int diff = b->hits - a->hits;
> +	if (!diff)
> +		diff = a->offset - b->offset;
> +	return diff;
> +}
> +
> +static void sort_dict_entries_by_hits(struct dict_table *t)
> +{
> +	qsort(t->entry, t->nb_entries, sizeof(*t->entry), cmp_dict_entries);
> +	t->hash_size = (t->nb_entries * 4 / 3) / 2;
> +	rehash_entries(t);
> +}
> +
> +void dict_dump(struct dict_table *t)
> +{
> +	int i;
> +
> +	sort_dict_entries_by_hits(t);
> +	for (i = 0; i < t->nb_entries; i++)
> +		printf("%d\t%s\n",
> +			t->entry[i].hits,
> +			t->data + t->entry[i].offset);
> +}

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 05/23] pack v4: add commit object parsing
  2013-08-27  4:25 ` [PATCH 05/23] pack v4: add commit object parsing Nicolas Pitre
@ 2013-08-27 15:26   ` Junio C Hamano
  2013-08-27 16:47     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:26 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Let's create another dictionary table to hold the author and committer
> entries.  We use the same table format used for tree entries where the
> 16 bit data prefix is conveniently used to store the timezone value.
>
> In order to copy straight from a commit object buffer, dict_add_entry()
> is modified to get the string length as the provided string pointer is
> not always be null terminated.
>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
> @@ -135,8 +136,73 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
>  	rehash_entries(t);
>  }
>  
> +static struct dict_table *commit_name_table;
>  static struct dict_table *tree_path_table;
>  
> +/*
> + * Parse the author/committer line from a canonical commit object.
> + * The 'from' argument points right after the "author " or "committer "
> + * string.  The time zone is parsed and stored in *tz_val.  The returned
> + * pointer is right after the end of the email address which is also just
> + * before the time value, or NULL if a parsing error is encountered.
> + */
> +static char *get_nameend_and_tz(char *from, int *tz_val)
> +{
> +	char *end, *tz;
> +
> +	tz = strchr(from, '\n');
> +	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
> +	if (!tz || tz - from < 13)
> +		return NULL;
> +	tz -= 4;
> +	end = tz - 4;
> +	while (end - from > 5 && *end != ' ')
> +		end--;
> +	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
> +		return NULL;
> +	*tz_val = (tz[0] - '0') * 1000 +
> +		  (tz[1] - '0') * 100 +
> +		  (tz[2] - '0') * 10 +
> +		  (tz[3] - '0');
> +	switch (tz[-1]) {
> +	default:	return NULL;
> +	case '+':	break;
> +	case '-':	*tz_val = -*tz_val;
> +	}
> +	return end;
> +}

This may want to share code with ident.c::split_ident_line(), as we
have been trying to reduce the number of ident-line parsers.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 08/23] pack v4: basic references encoding
  2013-08-27  4:25 ` [PATCH 08/23] pack v4: basic references encoding Nicolas Pitre
@ 2013-08-27 15:29   ` Junio C Hamano
  2013-08-27 15:53     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:29 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Add variable length number encoding.  Let's apply the same encoding
> currently used for the offset to base of OBJ_OFS_DELTA objects which
> is the most efficient way to do this, and apply it to everything with
> pack version 4.
>
> Also add SHA1 reference encoding.  This one is either an index into a
> SHA1 table encoded using the variable length number encoding, or the
> literal 20 bytes SHA1 prefixed with a 0.
>
> The index 0 discriminates between an actual index value or the literal
> SHA1.  Therefore when the index is used its value must be increased by 1.
>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  packv4-create.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 44 insertions(+)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index 012129b..bf33d15 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -245,6 +245,50 @@ static void dict_dump(void)
>  	dump_dict_table(tree_path_table);
>  }
>  
> +/*
> + * Encode a numerical value with variable length into the destination buffer
> + */
> +static unsigned char *add_number(unsigned char *dst, uint64_t value)
> +{
> +	unsigned char buf[20];
> +	unsigned pos = sizeof(buf) - 1;
> +	buf[pos] = value & 127;
> +	while (value >>= 7)
> +		buf[--pos] = 128 | (--value & 127);
> +	do {
> +		*dst++ = buf[pos++];
> +	} while (pos < sizeof(buf));
> +	return dst;
> +}

This may want to reuse (or enhance-then-reuse) varint.[ch].

> +/*
> + * Encode an object SHA1 reference with either an object index into the
> + * pack SHA1 table incremented by 1, or the literal SHA1 value prefixed
> + * with a zero byte if the needed SHA1 is not available in the table.
> + */
> +static struct pack_idx_entry *all_objs;
> +static unsigned all_objs_nr;
> +static unsigned char *add_sha1_ref(unsigned char *dst, const unsigned char *sha1)
> +{
> +	unsigned lo = 0, hi = all_objs_nr;
> +
> +	do {
> +		unsigned mi = (lo + hi) / 2;
> +		int cmp = hashcmp(all_objs[mi].sha1, sha1);
> +
> +		if (cmp == 0)
> +			return add_number(dst, mi + 1);
> +		if (cmp > 0)
> +			hi = mi;
> +		else
> +			lo = mi+1;
> +	} while (lo < hi);
> +
> +	*dst++ = 0;
> +	hashcpy(dst, sha1);
> +	return dst + 20;
> +}
> +
>  static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
>  {
>  	unsigned i, nr_objects = p->num_objects;

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-08-27  4:25 ` [PATCH 09/23] pack v4: commit object encoding Nicolas Pitre
@ 2013-08-27 15:39   ` Junio C Hamano
  2013-08-27 16:50     ` Nicolas Pitre
  2013-08-27 19:59     ` Nicolas Pitre
  2013-09-02 20:48   ` Duy Nguyen
  1 sibling, 2 replies; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:39 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> This goes as follows:
>
> - Tree reference: either variable length encoding of the index
>   into the SHA1 table or the literal SHA1 prefixed by 0 (see
>   add_sha1_ref()).
>
> - Parent count: variable length encoding of the number of parents.
>   This is normally going to occupy a single byte but doesn't have to.
>
> - List of parent references: a list of add_sha1_ref() encoded references,
>   or nothing if the parent count was zero.
>
> - Author reference: variable length encoding of an index into the author
>   string dictionary table which also covers the time zone.  To make the
>   overall encoding efficient, the author table is already sorted by usage
>   frequency so the most used names are first and require the shortest
>   index encoding.
>
> - Author time stamp: variable length encoded.  Year 2038 ready!
>
> - Committer reference: same as author reference.
>
> - Committer time stamp: same as author time stamp.
>
> The remainder of the canonical commit object content is then zlib
> compressed and appended to the above.
>
> Rationale: The most important commit object data is densely encoded while
> requiring no zlib inflate processing, and all SHA1 references are most
> likely to be direct indices into the pack index file requiring no SHA1
> search into the pack index file.
>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  packv4-create.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 119 insertions(+)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index bf33d15..cedbbd9 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -13,6 +13,9 @@
>  #include "tree-walk.h"
>  #include "pack.h"
>  
> +
> +static int pack_compression_level = Z_DEFAULT_COMPRESSION;
> +
>  struct data_entry {
>  	unsigned offset;
>  	unsigned size;
> @@ -289,6 +292,122 @@ static unsigned char *add_sha1_ref(unsigned char *dst, const unsigned char *sha1
>  	return dst + 20;
>  }
>  
> +/*
> + * This converts a canonical commit object buffer into its
> + * tightly packed representation using the already populated
> + * and sorted commit_name_table dictionary.  The parsing is
> + * strict so to ensure the canonical version may always be
> + * regenerated and produce the same hash.
> + */
> +void * conv_to_dict_commit(void *buffer, unsigned long *psize)

Drop SP between asterisk and "conv_"?

> +{
> +	unsigned long size = *psize;
> +	char *in, *tail, *end;
> +	unsigned char *out;
> +	unsigned char sha1[20];
> +	int nb_parents, index, tz_val;
> +	unsigned long time;
> +	z_stream stream;
> +	int status;
> +
> +	/*
> +	 * It is guaranteed that the output is always going to be smaller
> +	 * than the input.  We could even do this conversion in place.
> +	 */
> +	in = buffer;
> +	tail = in + size;
> +	buffer = xmalloc(size);
> +	out = buffer;
> +
> +	/* parse the "tree" line */
> +	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
> +		goto bad_data;
> +	if (get_sha1_hex(in + 5, sha1) < 0)
> +		goto bad_data;

Is this strict enough to guarantee roundtrip hash identity?  Because
get_sha1_hex() accepts hexadecimal represented with uppercase A-F,
you need to reject such a "broken" commit object, no?

Same for parent commit object names below that are parsed with the
same helper.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 10/23] pack v4: tree object encoding
  2013-08-27  4:25 ` [PATCH 10/23] pack v4: tree " Nicolas Pitre
@ 2013-08-27 15:44   ` Junio C Hamano
  2013-08-27 16:52     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:44 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> This goes as follows:
>
> - Number of tree entries: variable length encoded.
>
> Then for each tree entry:
>
> - Path component reference: variable length encoded index into the path
>   string dictionary table which also covers the entry mode.  To make the
>   overall encoding efficient, the author table is already sorted by usage
>   frequency so the most used names are first and require the shortest
>   index encoding.

s/author table/tree path table/, I think. The reason why it is a
good idea to sort these tables by use count applies equally to both
the author table and the tree path table, though.

> - SHA1 reference: either variable length encoding of the index into the
>   SHA1 table or the literal SHA1 prefixed by 0 (see add_sha1_ref()).
>
> Rationale: all the tree object data is densely encoded while requiring
> no zlib inflate processing, and all SHA1 references are most likely to
> be direct indices into the pack index file requiring no SHA1 search.
> Path filtering can be accomplished on the path index directly without
> any string comparison during the tree traversal.
>
> Still lacking is some kind of delta encoding for multiple tree objects
> with only small differences between them.  But that'll come later.
>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  packv4-create.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index cedbbd9..f46fbd9 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -408,6 +408,69 @@ bad:
>  	return NULL;
>  }
>  
> +/*
> + * This converts a canonical tree object buffer into its
> + * tightly packed representation using the already populated
> + * and sorted tree_path_table dictionary.  The parsing is
> + * strict so to ensure the canonical version may always be
> + * regenerated and produce the same hash.
> + */
> +void * conv_to_dict_tree(void *buffer, unsigned long *psize)
> +{
> +	unsigned long size = *psize;
> +	unsigned char *in, *out, *end;
> +	struct tree_desc desc;
> +	struct name_entry name_entry;
> +	int nb_entries;
> +
> +	if (!size)
> +		return NULL;
> +
> +	/*
> +	 * We can't make sure the result will always be smaller.
> +	 * The smallest possible entry is "0 x\0<40 byte SHA1>"
> +	 * or 44 bytes.  The output entry may have a realistic path index
> +	 * encoding using up to 3 bytes, and a non indexable SHA1 meaning
> +	 * 41 bytes.  And the output data already has the and nb_entries
> +	 * headers.  In practice the output size will be significantly
> +	 * smaller but for now let's make it simple.
> +	 */
> +	in = buffer;
> +	out = xmalloc(size);
> +	end = out + size;
> +	buffer = out;
> +
> +	/* let's count how many entries there are */
> +	init_tree_desc(&desc, in, size);
> +	nb_entries = 0;
> +	while (tree_entry(&desc, &name_entry))
> +		nb_entries++;
> +	out = add_number(out, nb_entries);
> +
> +	init_tree_desc(&desc, in, size);
> +	while (tree_entry(&desc, &name_entry)) {
> +		int pathlen = tree_entry_len(&name_entry);
> +		int index = dict_add_entry(tree_path_table, name_entry.mode,
> +					   name_entry.path, pathlen);
> +		if (index < 0) {
> +			error("missing tree dict entry");
> +			free(buffer);
> +			return NULL;
> +		}
> +		if (end - out < 45) {
> +			unsigned long sofar = out - (unsigned char *)buffer;
> +			buffer = xrealloc(buffer, sofar + 45);
> +			out = (unsigned char *)buffer + sofar;
> +			end = out + 45;
> +		}
> +		out = add_number(out, index);
> +		out = add_sha1_ref(out, name_entry.sha1);
> +	}
> +
> +	*psize = out - (unsigned char *)buffer;
> +	return buffer;
> +}
> +
>  static struct pack_idx_entry *get_packed_object_list(struct packed_git *p)
>  {
>  	unsigned i, nr_objects = p->num_objects;

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 12/23] pack v4: creation code
  2013-08-27  4:25 ` [PATCH 12/23] pack v4: creation code Nicolas Pitre
@ 2013-08-27 15:48   ` Junio C Hamano
  2013-08-27 16:59     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:48 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Let's actually open the destination pack file and write the header and
> the tables.
>
> The header isn't much different from pack v3, except for the pack version
> number of course.
>
> The first table is the sorted SHA1 table normally found in the pack index
> file.  With pack v4 we write this table in the main pack file instead as
> it is index referenced by subsequent objects in the pack.  Doing so has
> many advantages:
>
> - The SHA1 references used to be duplicated on disk: once in the pack
>   index file, and then at least once or more within commit and tree
>   objects referencing them.  The only SHA1 which is not being listed more
>   than once this way is the one for a branch tip commit object and those
>   are normally very few.  Now all that SHA1 data is represented only once.
>

This tickles my curiosity. Why isn't this SHA-1 table sorted by
reference count the same way as the tree path and the people name
tables to keep the average length of varint references short?

> - The SHA1 references found in commit and tree objects can be obtained
>   on disk directly without having to deflate those objects first.
>
> The SHA1 table size is obtained by multiplying the number of objects by 20.
>
> And then the commit and path dictionary tables are written right after
> the SHA1 table.

> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  packv4-create.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 55 insertions(+), 5 deletions(-)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index 2956fda..5211f9c 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -605,6 +605,48 @@ static unsigned long write_dict_table(struct sha1file *f, struct dict_table *t)
>  	return hdrlen + datalen;
>  }
>  
> +static struct sha1file * packv4_open(char *path)
> +{
> +	int fd;
> +
> +	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0600);
> +	if (fd < 0)
> +		die_errno("unable to create '%s'", path);
> +	return sha1fd(fd, path);
> +}
> +
> +static unsigned int packv4_write_header(struct sha1file *f, unsigned nr_objects)
> +{
> +	struct pack_header hdr;
> +
> +	hdr.hdr_signature = htonl(PACK_SIGNATURE);
> +	hdr.hdr_version = htonl(4);
> +	hdr.hdr_entries = htonl(nr_objects);
> +	sha1write(f, &hdr, sizeof(hdr));
> +
> +	return sizeof(hdr);
> +}
> +
> +static unsigned long packv4_write_tables(struct sha1file *f, unsigned nr_objects,
> +					 struct pack_idx_entry *objs)
> +{
> +	unsigned i;
> +	unsigned long written = 0;
> +
> +	/* The sorted list of object SHA1's is always first */
> +	for (i = 0; i < nr_objects; i++)
> +		sha1write(f, objs[i].sha1, 20);
> +	written = 20 * nr_objects;
> +
> +	/* Then the commit dictionary table */
> +	written += write_dict_table(f, commit_name_table);
> +
> +	/* Followed by the path component dictionary table */
> +	written += write_dict_table(f, tree_path_table);
> +
> +	return written;
> +}
> +
>  static struct packed_git *open_pack(const char *path)
>  {
>  	char arg[PATH_MAX];
> @@ -658,9 +700,10 @@ static struct packed_git *open_pack(const char *path)
>  	return p;
>  }
>  
> -static void process_one_pack(char *src_pack)
> +static void process_one_pack(char *src_pack, char *dst_pack)
>  {
>  	struct packed_git *p;
> +	struct sha1file *f;
>  	struct pack_idx_entry *objs, **p_objs;
>  	unsigned nr_objects;
>  
> @@ -673,15 +716,22 @@ static void process_one_pack(char *src_pack)
>  	p_objs = sort_objs_by_offset(objs, nr_objects);
>  
>  	create_pack_dictionaries(p, p_objs);
> +
> +	f = packv4_open(dst_pack);
> +	if (!f)
> +		die("unable to open destination pack");
> +	packv4_write_header(f, nr_objects);
> +	packv4_write_tables(f, nr_objects, objs);
>  }
>  
>  int main(int argc, char *argv[])
>  {
> -	if (argc != 2) {
> -		fprintf(stderr, "Usage: %s <packfile>\n", argv[0]);
> +	if (argc != 3) {
> +		fprintf(stderr, "Usage: %s <src_packfile> <dst_packfile>\n", argv[0]);
>  		exit(1);
>  	}
> -	process_one_pack(argv[1]);
> -	dict_dump();
> +	process_one_pack(argv[1], argv[2]);
> +	if (0)
> +		dict_dump();
>  	return 0;
>  }

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 08/23] pack v4: basic references encoding
  2013-08-27 15:29   ` Junio C Hamano
@ 2013-08-27 15:53     ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 15:53 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Add variable length number encoding.  Let's apply the same encoding
> > currently used for the offset to base of OBJ_OFS_DELTA objects which
> > is the most efficient way to do this, and apply it to everything with
> > pack version 4.
> >
> > Also add SHA1 reference encoding.  This one is either an index into a
> > SHA1 table encoded using the variable length number encoding, or the
> > literal 20 bytes SHA1 prefixed with a 0.
> >
> > The index 0 discriminates between an actual index value or the literal
> > SHA1.  Therefore when the index is used its value must be increased by 1.
> >
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> > ---
> >  packv4-create.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 44 insertions(+)
> >
> > diff --git a/packv4-create.c b/packv4-create.c
> > index 012129b..bf33d15 100644
> > --- a/packv4-create.c
> > +++ b/packv4-create.c
> > @@ -245,6 +245,50 @@ static void dict_dump(void)
> >  	dump_dict_table(tree_path_table);
> >  }
> >  
> > +/*
> > + * Encode a numerical value with variable length into the destination buffer
> > + */
> > +static unsigned char *add_number(unsigned char *dst, uint64_t value)
> > +{
> > +	unsigned char buf[20];
> > +	unsigned pos = sizeof(buf) - 1;
> > +	buf[pos] = value & 127;
> > +	while (value >>= 7)
> > +		buf[--pos] = 128 | (--value & 127);
> > +	do {
> > +		*dst++ = buf[pos++];
> > +	} while (pos < sizeof(buf));
> > +	return dst;
> > +}
> 
> This may want to reuse (or enhance-then-reuse) varint.[ch].

Goodie!  I didn't notice this had happened.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 14/23] pack v4: object data copy
  2013-08-27  4:25 ` [PATCH 14/23] pack v4: object data copy Nicolas Pitre
@ 2013-08-27 15:53   ` Junio C Hamano
  2013-08-27 18:24     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 15:53 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Blob and tag objects have no particular changes except for their object
> header.
>
> Delta objects are also copied as is, except for their delta base reference
> which is converted to the new way as used elsewhere in pack v4 encoding
> i.e. an index into the SHA1 table or a literal SHA1 prefixed by 0 if not
> found in the table (see add_sha1_ref).  This is true for both REF_DELTA
> as well as OFS_DELTA.
>
> Object payload is validated against the recorded CRC32 in the source
> pack index file when possible.
>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---

The title somewhat confused me until I realized that this series is
building a program that would convert existing data from a single
pack into packv4 format, not a "pack-objects --pack-verison=4".

>  packv4-create.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 66 insertions(+)
>
> diff --git a/packv4-create.c b/packv4-create.c
> index 6e0bb1d..a6dc818 100644
> --- a/packv4-create.c
> +++ b/packv4-create.c
> @@ -12,6 +12,7 @@
>  #include "object.h"
>  #include "tree-walk.h"
>  #include "pack.h"
> +#include "pack-revindex.h"
>  
>  
>  static int pack_compression_level = Z_DEFAULT_COMPRESSION;
> @@ -673,6 +674,71 @@ static unsigned int write_object_header(struct sha1file *f, enum object_type typ
>  	return end - buf;
>  }
>  
> +static unsigned long copy_object_data(struct sha1file *f, struct packed_git *p,
> +				      off_t offset)
> +{
> +	struct pack_window *w_curs = NULL;
> +	struct revindex_entry *revidx;
> +	enum object_type type;
> +	unsigned long avail, size, datalen, written;
> +	int hdrlen, idx_nr;
> +	unsigned char *src, *end, buf[24];
> +
> +	revidx = find_pack_revindex(p, offset);
> +	idx_nr = revidx->nr;
> +	datalen = revidx[1].offset - offset;
> +
> +	src = use_pack(p, &w_curs, offset, &avail);
> +	hdrlen = unpack_object_header_buffer(src, avail, &type, &size);
> +
> +	written = write_object_header(f, type, size);
> +
> +	if (type == OBJ_OFS_DELTA) {
> +		unsigned char c = src[hdrlen++];
> +		off_t base_offset = c & 127;
> +		while (c & 128) {
> +			base_offset += 1;
> +			if (!base_offset || MSB(base_offset, 7))
> +				die("delta offset overflow");
> +			c = src[hdrlen++];
> +			base_offset = (base_offset << 7) + (c & 127);
> +		}
> +		base_offset = offset - base_offset;
> +		if (base_offset <= 0 || base_offset >= offset)
> +			die("delta offset out of bound");
> +		revidx = find_pack_revindex(p, base_offset);
> +		end = add_sha1_ref(buf, nth_packed_object_sha1(p, revidx->nr));
> +		sha1write(f, buf, end - buf);
> +		written += end - buf;
> +	} else if (type == OBJ_REF_DELTA) {
> +		end = add_sha1_ref(buf, src + hdrlen);
> +		hdrlen += 20;
> +		sha1write(f, buf, end - buf);
> +		written += end - buf;
> +	}
> +
> +	if (p->index_version > 1 &&
> +	    check_pack_crc(p, &w_curs, offset, datalen, idx_nr))
> +		die("bad CRC for object at offset %"PRIuMAX" in %s",
> +		    (uintmax_t)offset, p->pack_name);
> +
> +	offset += hdrlen;
> +	datalen -= hdrlen;
> +
> +	while (datalen) {
> +		src = use_pack(p, &w_curs, offset, &avail);
> +		if (avail > datalen)
> +			avail = datalen;
> +		sha1write(f, src, avail);
> +		written += avail;
> +		offset += avail;
> +		datalen -= avail;
> +	}
> +	unuse_pack(&w_curs);
> +
> +	return written;
> +}
> +
>  static struct packed_git *open_pack(const char *path)
>  {
>  	char arg[PATH_MAX];

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 00/23] Preliminary pack v4 support
  2013-08-27 15:03 ` [PATCH 00/23] Preliminary pack v4 support Junio C Hamano
@ 2013-08-27 15:59   ` Nicolas Pitre
  2013-08-27 16:44     ` Junio C Hamano
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 15:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Subject says it all... at last !
> >
> > This can also be fetched here:
> >
> > 	git://git.linaro.org/people/nico/git
> >
> > Demonstration of what it does at the moment:
> >
> > 	http://article.gmane.org/gmane.comp.version-control.git/233038
> >
> > I'd like to preserve the author time stamps as they relate to where in
> > the world I was when the corresponding code was written.  You'll notice
> > I didn't work on the code in the same order as it is now presented.
> 
> We can also notice things like "From: user@machine.(none)" ;-)

Heh.

> > Still open question: what to do with a thin pack.  Should we really
> > complete it with local objects upon reception, or were we only over
> > paranoid at the time we imposed this rule?
> 
> I do not think paranoia had much to do with it.  I am afraid that
> allowing a delta in a pack to depend on a base in another pack means
> that the former pack becomes unusable without the latter, which
> would make object store management (e.g. partial repacking) a lot
> more cumbersome, no?

That's what I'm wondering.  We already end up with a broken repository 
if the commit graph is spread across multiple packs and one of those 
pack is removed.  Having a delta base in a separate pack is not much 
different in that regard.

So the rule could be that any kind of repacking must not carry over 
deltas with a non local base i.e. repack always produces delta 
references belonging to the same pack.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 01/23] pack v4: initial pack dictionary structure and code
  2013-08-27 15:08   ` Junio C Hamano
@ 2013-08-27 16:13     ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 16:13 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> > ---
> 
> Was there a reason not to reuse the hash-table Linus did in
> hash.[ch]?

Well... Most likely because when I started that code (which used to be 
quite different initially) it might not have been served correctly by 
hash.c, or any other reasons I long have forgotten by now which might or 
might not still be valid.

> It may not make much of a difference for something so small and
> isolated from the rest of the system, but if hash.[ch] can be easily
> fixed with a small tweak to suit the use by this subsystem better,
> it might be worth reusing the existing code with improvement, which
> may help other potential users.

Absolutely.  If someone wants to give a hand in that direction I'll 
happily integrate patches into my series.  I cannot promise I'll do the 
work myself as I prefer spending the time I have available on actually 
making pack v4 usable.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 00/23] Preliminary pack v4 support
  2013-08-27 15:59   ` Nicolas Pitre
@ 2013-08-27 16:44     ` Junio C Hamano
  2013-08-28  2:30       ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 16:44 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> On Tue, 27 Aug 2013, Junio C Hamano wrote:
>
>> Nicolas Pitre <nico@fluxnic.net> writes:
>> ... 
>> > I'd like to preserve the author time stamps as they relate to where in
>> > the world I was when the corresponding code was written.  You'll notice
>> > I didn't work on the code in the same order as it is now presented.
>> 
>> We can also notice things like "From: user@machine.(none)" ;-)
>
> Heh.

In any case, the "Date: " in-body header next to your "From: "
in-body header is your friend if you want to do the "where and when
did I work on this?"

>> > Still open question: what to do with a thin pack.  Should we really
>> > complete it with local objects upon reception, or were we only over
>> > paranoid at the time we imposed this rule?
>> 
>> I do not think paranoia had much to do with it.  I am afraid that
>> allowing a delta in a pack to depend on a base in another pack means
>> that the former pack becomes unusable without the latter, which
>> would make object store management (e.g. partial repacking) a lot
>> more cumbersome, no?
>
> That's what I'm wondering.  We already end up with a broken repository 
> if the commit graph is spread across multiple packs and one of those 
> pack is removed.  Having a delta base in a separate pack is not much 
> different in that regard.

In practice, maybe, but I somehow find that it is more fundamental
breakage not to be able to reconstitute objects that a pack and its
index claims to have than missing an object that is referenced in
the reachability graph.

As you have "0-index" escape hatch for SHA-1 table, but no similar
escape hatch for the people's name table, I can see why it may be
cumbersome to fix a thin pack by only appending to a received
packfile and updating a few header fields, though.

> So the rule could be that any kind of repacking must not carry over 
> deltas with a non local base i.e. repack always produces delta 
> references belonging to the same pack.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 05/23] pack v4: add commit object parsing
  2013-08-27 15:26   ` Junio C Hamano
@ 2013-08-27 16:47     ` Nicolas Pitre
  2013-08-27 17:42       ` Junio C Hamano
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 16:47 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Let's create another dictionary table to hold the author and committer
> > entries.  We use the same table format used for tree entries where the
> > 16 bit data prefix is conveniently used to store the timezone value.
> >
> > In order to copy straight from a commit object buffer, dict_add_entry()
> > is modified to get the string length as the provided string pointer is
> > not always be null terminated.
> >
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> > ---
> > @@ -135,8 +136,73 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
> >  	rehash_entries(t);
> >  }
> >  
> > +static struct dict_table *commit_name_table;
> >  static struct dict_table *tree_path_table;
> >  
> > +/*
> > + * Parse the author/committer line from a canonical commit object.
> > + * The 'from' argument points right after the "author " or "committer "
> > + * string.  The time zone is parsed and stored in *tz_val.  The returned
> > + * pointer is right after the end of the email address which is also just
> > + * before the time value, or NULL if a parsing error is encountered.
> > + */
> > +static char *get_nameend_and_tz(char *from, int *tz_val)
> > +{
> > +	char *end, *tz;
> > +
> > +	tz = strchr(from, '\n');
> > +	/* let's assume the smallest possible string to be "x <x> 0 +0000\n" */
> > +	if (!tz || tz - from < 13)
> > +		return NULL;
> > +	tz -= 4;
> > +	end = tz - 4;
> > +	while (end - from > 5 && *end != ' ')
> > +		end--;
> > +	if (end[-1] != '>' || end[0] != ' ' || tz[-2] != ' ')
> > +		return NULL;
> > +	*tz_val = (tz[0] - '0') * 1000 +
> > +		  (tz[1] - '0') * 100 +
> > +		  (tz[2] - '0') * 10 +
> > +		  (tz[3] - '0');
> > +	switch (tz[-1]) {
> > +	default:	return NULL;
> > +	case '+':	break;
> > +	case '-':	*tz_val = -*tz_val;
> > +	}
> > +	return end;
> > +}
> 
> This may want to share code with ident.c::split_ident_line(), as we
> have been trying to reduce the number of ident-line parsers.

Hmmm....  The problem I have with split_ident_line() right now is about 
the fact that it is too liberal with whitespaces.  Here I must be sure I 
can deconstruct a commit object and be sure I still can regenerate it 
byte for byte in order to match its SHA1 signature.

So there _must_ always be only one space between the email closing 
bracket and the time stamp, only one space between the time stamp and 
the time zone value, and no space after the time zone.

Is there a reason why split_ident_line() is not stricter in that regard?


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-08-27 15:39   ` Junio C Hamano
@ 2013-08-27 16:50     ` Nicolas Pitre
  2013-08-27 19:59     ` Nicolas Pitre
  1 sibling, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 16:50 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > This goes as follows:
> >
> > - Tree reference: either variable length encoding of the index
> >   into the SHA1 table or the literal SHA1 prefixed by 0 (see
> >   add_sha1_ref()).
> >
> > - Parent count: variable length encoding of the number of parents.
> >   This is normally going to occupy a single byte but doesn't have to.
> >
> > - List of parent references: a list of add_sha1_ref() encoded references,
> >   or nothing if the parent count was zero.
> >
> > - Author reference: variable length encoding of an index into the author
> >   string dictionary table which also covers the time zone.  To make the
> >   overall encoding efficient, the author table is already sorted by usage
> >   frequency so the most used names are first and require the shortest
> >   index encoding.
> >
> > - Author time stamp: variable length encoded.  Year 2038 ready!
> >
> > - Committer reference: same as author reference.
> >
> > - Committer time stamp: same as author time stamp.
> >
> > The remainder of the canonical commit object content is then zlib
> > compressed and appended to the above.
> >
> > Rationale: The most important commit object data is densely encoded while
> > requiring no zlib inflate processing, and all SHA1 references are most
> > likely to be direct indices into the pack index file requiring no SHA1
> > search into the pack index file.
> >
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> > ---
> >  packv4-create.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 119 insertions(+)
> >
> > diff --git a/packv4-create.c b/packv4-create.c
> > index bf33d15..cedbbd9 100644
> > --- a/packv4-create.c
> > +++ b/packv4-create.c
> > @@ -13,6 +13,9 @@
> >  #include "tree-walk.h"
> >  #include "pack.h"
> >  
> > +
> > +static int pack_compression_level = Z_DEFAULT_COMPRESSION;
> > +
> >  struct data_entry {
> >  	unsigned offset;
> >  	unsigned size;
> > @@ -289,6 +292,122 @@ static unsigned char *add_sha1_ref(unsigned char *dst, const unsigned char *sha1
> >  	return dst + 20;
> >  }
> >  
> > +/*
> > + * This converts a canonical commit object buffer into its
> > + * tightly packed representation using the already populated
> > + * and sorted commit_name_table dictionary.  The parsing is
> > + * strict so to ensure the canonical version may always be
> > + * regenerated and produce the same hash.
> > + */
> > +void * conv_to_dict_commit(void *buffer, unsigned long *psize)
> 
> Drop SP between asterisk and "conv_"?
> 
> > +{
> > +	unsigned long size = *psize;
> > +	char *in, *tail, *end;
> > +	unsigned char *out;
> > +	unsigned char sha1[20];
> > +	int nb_parents, index, tz_val;
> > +	unsigned long time;
> > +	z_stream stream;
> > +	int status;
> > +
> > +	/*
> > +	 * It is guaranteed that the output is always going to be smaller
> > +	 * than the input.  We could even do this conversion in place.
> > +	 */
> > +	in = buffer;
> > +	tail = in + size;
> > +	buffer = xmalloc(size);
> > +	out = buffer;
> > +
> > +	/* parse the "tree" line */
> > +	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
> > +		goto bad_data;
> > +	if (get_sha1_hex(in + 5, sha1) < 0)
> > +		goto bad_data;
> 
> Is this strict enough to guarantee roundtrip hash identity?  Because
> get_sha1_hex() accepts hexadecimal represented with uppercase A-F,
> you need to reject such a "broken" commit object, no?

Indeed, yes.  Same concern as with split_ident_line() I mentioned 
before.

> Same for parent commit object names below that are parsed with the
> same helper.

Exact.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 10/23] pack v4: tree object encoding
  2013-08-27 15:44   ` Junio C Hamano
@ 2013-08-27 16:52     ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 16:52 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > This goes as follows:
> >
> > - Number of tree entries: variable length encoded.
> >
> > Then for each tree entry:
> >
> > - Path component reference: variable length encoded index into the path
> >   string dictionary table which also covers the entry mode.  To make the
> >   overall encoding efficient, the author table is already sorted by usage
> >   frequency so the most used names are first and require the shortest
> >   index encoding.
> 
> s/author table/tree path table/, I think. The reason why it is a
> good idea to sort these tables by use count applies equally to both
> the author table and the tree path table, though.

Exact.  This was a cut and paste mistake.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 12/23] pack v4: creation code
  2013-08-27 15:48   ` Junio C Hamano
@ 2013-08-27 16:59     ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 16:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Let's actually open the destination pack file and write the header and
> > the tables.
> >
> > The header isn't much different from pack v3, except for the pack version
> > number of course.
> >
> > The first table is the sorted SHA1 table normally found in the pack index
> > file.  With pack v4 we write this table in the main pack file instead as
> > it is index referenced by subsequent objects in the pack.  Doing so has
> > many advantages:
> >
> > - The SHA1 references used to be duplicated on disk: once in the pack
> >   index file, and then at least once or more within commit and tree
> >   objects referencing them.  The only SHA1 which is not being listed more
> >   than once this way is the one for a branch tip commit object and those
> >   are normally very few.  Now all that SHA1 data is represented only once.
> >
> 
> This tickles my curiosity. Why isn't this SHA-1 table sorted by
> reference count the same way as the tree path and the people name
> tables to keep the average length of varint references short?

Doing so allows for the SHA1 index used in objects to be used directly 
for lookups into the pack index in order to know immediately the 
location of the referenced object bypassing the binary search.  
Furthermore, SHA1 references are rather evenly spread across the whole 
table.  Only tree objects may share the same SHA1 references repeatedly 
across multiple objects, and those are likely to end up being deltas 
against each other.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 05/23] pack v4: add commit object parsing
  2013-08-27 16:47     ` Nicolas Pitre
@ 2013-08-27 17:42       ` Junio C Hamano
  0 siblings, 0 replies; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 17:42 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> Hmmm....  The problem I have with split_ident_line() right now is about 
> the fact that it is too liberal with whitespaces.  Here I must be sure I 
> can deconstruct a commit object and be sure I still can regenerate it 
> byte for byte in order to match its SHA1 signature.

Yeah, I now see.  It's the same "accept with leniency"
get_sha1_hex() does, which is not appropriate for the 
purpose of this codepath.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 14/23] pack v4: object data copy
  2013-08-27 15:53   ` Junio C Hamano
@ 2013-08-27 18:24     ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 18:24 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > Blob and tag objects have no particular changes except for their object
> > header.
> >
> > Delta objects are also copied as is, except for their delta base reference
> > which is converted to the new way as used elsewhere in pack v4 encoding
> > i.e. an index into the SHA1 table or a literal SHA1 prefixed by 0 if not
> > found in the table (see add_sha1_ref).  This is true for both REF_DELTA
> > as well as OFS_DELTA.
> >
> > Object payload is validated against the recorded CRC32 in the source
> > pack index file when possible.
> >
> > Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> > ---
> 
> The title somewhat confused me until I realized that this series is
> building a program that would convert existing data from a single
> pack into packv4 format, not a "pack-objects --pack-verison=4".

I initially started with extensions to pack-objects but that quickly 
became a big hassle to keep things working while experimenting.  So this 
tool is just a straight pack converter which in itself turned out to be 
quite complex already.  Eventually its code could be merged into 
pack-objects.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] Document pack v4 format
  2013-08-27 11:17 ` [PATCH] Document pack v4 format Nguyễn Thái Ngọc Duy
@ 2013-08-27 18:25   ` Junio C Hamano
  2013-08-27 18:53   ` Nicolas Pitre
  2013-08-31  2:49   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2 siblings, 0 replies; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 18:25 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  For my education but may help people who are interested in the
>  format. Most is gathered from commit messages, except the delta tree
>  entries.

Thanks.

> diff --git a/Documentation/technical/pack-format-v4.txt b/Documentation/technical/pack-format-v4.txt

In the final version it may be a good idea to either have this
together with the documentation for the existing pack-formats, or
add a reference from the documentation for the existing formats to
point at this new file saying "for v4 see ...".

> new file mode 100644
> index 0000000..9123a53
> --- /dev/null
> +++ b/Documentation/technical/pack-format-v4.txt
> @@ -0,0 +1,110 @@
> +Git pack v4 format
> +==================
> +
> +== pack-*.pack files have the following format:
> +
> +   - A header appears at the beginning and consists of the following:
> +
> +     4-byte signature:
> +	  The signature is: {'P', 'A', 'C', 'K'}
> +
> +     4-byte version number (network byte order): must be version
> +     number 4
> +
> +     4-byte number of objects contained in the pack (network byte
> +     order)
> +
> +   - (20 * nr_objects)-byte SHA-1 table: sorted in memcmp() order.
> +
> +   - Commit name dictionary: the uncompressed length in variable
> +     encoding, followed by zlib-compressed dictionary. Each entry
> +     consists of two prefix bytes storing timezone followed by a
> +     NUL-terminated string.

The log and code use different names to call this thing.  "commit
name" is misleading (e.g. it is not "commit object name", but "names
recorded in commit objects"; it is not only for "committer" names,
but also applies to authors; it is not just names but also emails
and TZ used).  Perhaps a better name would be "ident" table, as we
use the word "ident" only to refer to data to refer to people who
are recorded on either author/committer/tagger lines of the objects?

> +     (undeltified representation)
> +     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +     [uncompressed data]
> +     [compressed data]

These two lines are not useful; it is better spelled as [data
specific to object type] as you have to enumerate what are stored
and how for each type separately anyway.

> +=== Tree representation
> +
> +  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +
> +  - Number of trees in variable length encoding
> +
> +  - A number of trees, each consists of

The above "number of trees" sounds both wrong; aren't they the
number of "tree entries" (that can be blobs or subtrees) this tree
object records?

> +    Path component reference: an index, in variable length encoding,
> +    into tree path dictionary, which also covers entry mode.
> +
> +    SHA-1 in SHA-1 reference encoding.
> +
> +Path component reference zero is an indicator of deltified portion and
> +has the following format:
> +
> +  - path component reference: zero
> +
> +  - index of the entry to copy from, in variable length encoding
> +
> +  - number of entries in variable length encoding
> +
> +  - base tree in SHA-1 reference encoding
> +
> +=== SHA-1 reference encoding
> +
> +This encoding is used to encode SHA-1 efficiently if it's already in
> +the SHA-1 table. It starts with an index number in variable length
> +encoding. If it's not zero, its value minus one is the index in the
> +SHA-1 table. If it's zero, 20 bytes of SHA-1 is followed.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] Document pack v4 format
  2013-08-27 11:17 ` [PATCH] Document pack v4 format Nguyễn Thái Ngọc Duy
  2013-08-27 18:25   ` Junio C Hamano
@ 2013-08-27 18:53   ` Nicolas Pitre
  2013-08-31  2:49   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 18:53 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5681 bytes --]

On Tue, 27 Aug 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  For my education but may help people who are interested in the
>  format. Most is gathered from commit messages, except the delta tree
>  entries.

Excellent!  That's the kind of thing I need help with.

Some corrections below.

>  Documentation/technical/pack-format-v4.txt (new) | 110 +++++++++++++++++++++++
>  1 file changed, 110 insertions(+)
>  create mode 100644 Documentation/technical/pack-format-v4.txt
> 
> diff --git a/Documentation/technical/pack-format-v4.txt b/Documentation/technical/pack-format-v4.txt
> new file mode 100644
> index 0000000..9123a53
> --- /dev/null
> +++ b/Documentation/technical/pack-format-v4.txt
> @@ -0,0 +1,110 @@
> +Git pack v4 format
> +==================
> +
> +== pack-*.pack files have the following format:
> +
> +   - A header appears at the beginning and consists of the following:
> +
> +     4-byte signature:
> +	  The signature is: {'P', 'A', 'C', 'K'}
> +
> +     4-byte version number (network byte order): must be version
> +     number 4
> +
> +     4-byte number of objects contained in the pack (network byte
> +     order)

Conceptually, I'd suggest we don't talk about what follows as part of 
the header.  Maybe the "tables" section would make more sense.

> +   - (20 * nr_objects)-byte SHA-1 table: sorted in memcmp() order.
> +
> +   - Commit name dictionary: the uncompressed length in variable
> +     encoding, followed by zlib-compressed dictionary. Each entry
> +     consists of two prefix bytes storing timezone followed by a
> +     NUL-terminated string.
> +
> +     Entries should be sorted by frequency so that the most frequent
> +     entry has the smallest index, thus most efficient variable
> +     encoding.
> +
> +   - Tree path dictionary: similar format to commit name
> +     dictionary. Each entry consists of two prefix bytes storing entry
> +     mode, then a NUL-terminated path name. Same sort order
> +     recommendation applies.
> +
> +   - The header is followed by number of object entries, each of
> +     which looks like this:
> +
> +     (undeltified representation)
> +     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +     [uncompressed data]
> +     [compressed data]
> +
> +     (deltified representation)
> +     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +     base object name in SHA-1 reference encoding
> +     compressed delta data
> +
> +     In undeltified format, blobs and tags do not have the
> +     uncompressed data, all object content is compressed. Trees are
> +     not compressed at all. Some headers in commits are stored
> +     uncompressed, the rest is compressed.
> +
> +     All objects except trees are deltified and compressed the same
> +     way in v3. Trees however are deltified differently and use

Let's note that commits are not deltified at all.

> +     undeltified representation. See "Tree representation" below for
> +     details.
> +
> +  - The trailer records 20-byte SHA-1 checksum of all of the above.
> +
> +=== Commit representation
> +
> +  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +
> +  - Tree SHA-1 in SHA-1 reference encoding
> +
> +  - Parent count in variable length encoding
> +
> +  - Parent SHA-1s in SHA-1 reference encoding
> +
> +  - Author reference: the index, in variable length encoding, to comit
> +    name dictionary, which covers the name and also the time zone.
> +
> +  - Author timestamp in variable length encoding
> +
> +  - Committer reference: the index, in variable length encoding, to
> +    comit name dictionary, which covers the name and also the time
> +    zone.
> +
> +  - Committer timestamp in variable length encoding
> +
> +  - Compressed data of remaining header and the body
> +
> +=== Tree representation
> +
> +  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +
> +  - Number of trees in variable length encoding

Maybe that would be better to refer to tree entries instead of trees.

> +  - A number of trees, each consists of
> +
> +    Path component reference: an index, in variable length encoding,
> +    into tree path dictionary, which also covers entry mode.

Note the index nust have 1 added due to 0 being reserved.

> +    SHA-1 in SHA-1 reference encoding.
> +
> +Path component reference zero is an indicator of deltified portion and
> +has the following format:
> +
> +  - path component reference: zero
> +
> +  - index of the entry to copy from, in variable length encoding

I'd say "index of the starting entry in the tree object to copy from" to 
be clearer.  Also bit 0 of this index has a special meaning, therefore 
the actual index must be shifted left by 1 bit.

> +  - number of entries in variable length encoding

"number of entries to copy ..."

> +
> +  - base tree in SHA-1 reference encoding

The presence of this one depends on bit 0 in the index above.  If that 
bit is set then this base tree is provided.  Otherwise it is not 
provided and the previous base tree reference encountered in this tree 
object should be used.  This is to avoid repeating a SHA1 reference,
especially if a literal 20-byte SHA1 has to be used, when multiple 
references to the same base tree are made which is likely to be the 
common case.

> +=== SHA-1 reference encoding
> +
> +This encoding is used to encode SHA-1 efficiently if it's already in
> +the SHA-1 table. It starts with an index number in variable length
> +encoding. If it's not zero, its value minus one is the index in the
> +SHA-1 table. If it's zero, 20 bytes of SHA-1 is followed.
> -- 
> 1.8.2.83.gc99314b

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-08-27 15:39   ` Junio C Hamano
  2013-08-27 16:50     ` Nicolas Pitre
@ 2013-08-27 19:59     ` Nicolas Pitre
  2013-08-27 20:15       ` Junio C Hamano
  1 sibling, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 19:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > +	/* parse the "tree" line */
> > +	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
> > +		goto bad_data;
> > +	if (get_sha1_hex(in + 5, sha1) < 0)
> > +		goto bad_data;
> 
> Is this strict enough to guarantee roundtrip hash identity?  Because
> get_sha1_hex() accepts hexadecimal represented with uppercase A-F,
> you need to reject such a "broken" commit object, no?

BTW, is there any such objects in existence where sha1 ascii strings are 
represented using uppercase letters?  Because there would be a simple 
way to encode that fact in the pack v4 sha1 reference... but that change 
has to happen now.

I'm already claiming we won't support mixed case though.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-08-27 19:59     ` Nicolas Pitre
@ 2013-08-27 20:15       ` Junio C Hamano
  2013-08-27 21:43         ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-08-27 20:15 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@fluxnic.net> writes:

> On Tue, 27 Aug 2013, Junio C Hamano wrote:
>
>> Nicolas Pitre <nico@fluxnic.net> writes:
>> 
>> > +	/* parse the "tree" line */
>> > +	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
>> > +		goto bad_data;
>> > +	if (get_sha1_hex(in + 5, sha1) < 0)
>> > +		goto bad_data;
>> 
>> Is this strict enough to guarantee roundtrip hash identity?  Because
>> get_sha1_hex() accepts hexadecimal represented with uppercase A-F,
>> you need to reject such a "broken" commit object, no?
>
> BTW, is there any such objects in existence where sha1 ascii strings are 
> represented using uppercase letters?

Any commit or tag object that refers to another object with SHA-1
using uppercase letters is broken and invalid.  get_sha1_hex() is
not limited to reading these (i.e. it also is used to read object
name given on the command line) so it is lenient, but the above
codepath should care so that the result of hashing will stay the
same.

> Because there would be a simple 
> way to encode that fact in the pack v4 sha1 reference... but that change 
> has to happen now.

Hence, I do not think we care.

> I'm already claiming we won't support mixed case though.

Yeah, I am already claiming we won't support any uppercase ;-).

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-08-27 20:15       ` Junio C Hamano
@ 2013-08-27 21:43         ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-27 21:43 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, 27 Aug 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > On Tue, 27 Aug 2013, Junio C Hamano wrote:
> >
> >> Nicolas Pitre <nico@fluxnic.net> writes:
> >> 
> >> > +	/* parse the "tree" line */
> >> > +	if (in + 46 >= tail || memcmp(in, "tree ", 5) || in[45] != '\n')
> >> > +		goto bad_data;
> >> > +	if (get_sha1_hex(in + 5, sha1) < 0)
> >> > +		goto bad_data;
> >> 
> >> Is this strict enough to guarantee roundtrip hash identity?  Because
> >> get_sha1_hex() accepts hexadecimal represented with uppercase A-F,
> >> you need to reject such a "broken" commit object, no?
> >
> > BTW, is there any such objects in existence where sha1 ascii strings are 
> > represented using uppercase letters?
> 
> Any commit or tag object that refers to another object with SHA-1
> using uppercase letters is broken and invalid.  get_sha1_hex() is
> not limited to reading these (i.e. it also is used to read object
> name given on the command line) so it is lenient, but the above
> codepath should care so that the result of hashing will stay the
> same.

Indeed, hence my concern about encoding the original case.

> > Because there would be a simple 
> > way to encode that fact in the pack v4 sha1 reference... but that change 
> > has to happen now.
> 
> Hence, I do not think we care.
> 
> > I'm already claiming we won't support mixed case though.
> 
> Yeah, I am already claiming we won't support any uppercase ;-).

Perfect!

I've added a get_sha1_lowhex() to my tree.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 00/23] Preliminary pack v4 support
  2013-08-27 16:44     ` Junio C Hamano
@ 2013-08-28  2:30       ` Duy Nguyen
  2013-08-28  2:58         ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-08-28  2:30 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, Git Mailing List

On Tue, Aug 27, 2013 at 11:44 PM, Junio C Hamano <gitster@pobox.com> wrote:
> As you have "0-index" escape hatch for SHA-1 table, but no similar
> escape hatch for the people's name table, I can see why it may be
> cumbersome to fix a thin pack by only appending to a received
> packfile and updating a few header fields, though.

We also need an escape hatch for path name table. But what if we store
appended objects in OBJ_REF_DELTA format where base ref is empty
tree/commit and cached by sha1_file.c? We won't need to update
dictionary tables. Parsing is a bit ugly though (e.g. v3 tree with v2
base) but we have to deal with that anyway because people can have v2
and v3 packs mixed in.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 00/23] Preliminary pack v4 support
  2013-08-28  2:30       ` Duy Nguyen
@ 2013-08-28  2:58         ` Nicolas Pitre
  2013-08-28  3:06           ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-08-28  2:58 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Junio C Hamano, Git Mailing List

On Wed, 28 Aug 2013, Duy Nguyen wrote:

> On Tue, Aug 27, 2013 at 11:44 PM, Junio C Hamano <gitster@pobox.com> wrote:
> > As you have "0-index" escape hatch for SHA-1 table, but no similar
> > escape hatch for the people's name table, I can see why it may be
> > cumbersome to fix a thin pack by only appending to a received
> > packfile and updating a few header fields, though.
> 
> We also need an escape hatch for path name table.

Well, right. I think this is probably the cleanest solution if we don't 
want to update the commit/tree dictionary table with new entries (they 
could simply be appended at the end).  That wouldn't work for the SHA1 
table though, so perhaps a secondary table for the appended objects 
could then be carried into the pack index file.

> But what if we store
> appended objects in OBJ_REF_DELTA format where base ref is empty
> tree/commit and cached by sha1_file.c?

I don't follow you here.  Missing objects need to be added to the pack, 
they can't be cached anywhere.

> We won't need to update
> dictionary tables. Parsing is a bit ugly though (e.g. v3 tree with v2
> base) but we have to deal with that anyway because people can have v2
> and v3 packs mixed in.

Absolutely.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 00/23] Preliminary pack v4 support
  2013-08-28  2:58         ` Nicolas Pitre
@ 2013-08-28  3:06           ` Duy Nguyen
  0 siblings, 0 replies; 83+ messages in thread
From: Duy Nguyen @ 2013-08-28  3:06 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, Git Mailing List

On Wed, Aug 28, 2013 at 9:58 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> But what if we store
>> appended objects in OBJ_REF_DELTA format where base ref is empty
>> tree/commit and cached by sha1_file.c?
>
> I don't follow you here.  Missing objects need to be added to the pack,
> they can't be cached anywhere.

I use ref-delta as an alternate escape hatch to store missing commits
and trees in unparsed format. In order to do that, I need a base ref
SHA-1 and we can create and store SHA-1 of empty commit and emptry
tree in sha1_file.c (because empty commit is not in a valid format and
can't be parsed and stored in v3). So the result pack is still "thin"
but it should only lack at most two objects: empty commit and empty
tree. Thinking again this is much uglier than escape hatches to v3
tables..
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v2] Document pack v4 format
  2013-08-27 11:17 ` [PATCH] Document pack v4 format Nguyễn Thái Ngọc Duy
  2013-08-27 18:25   ` Junio C Hamano
  2013-08-27 18:53   ` Nicolas Pitre
@ 2013-08-31  2:49   ` Nguyễn Thái Ngọc Duy
  2013-09-03  6:00     ` Nicolas Pitre
  2013-09-06  2:14     ` [PATCH v3] " Nguyễn Thái Ngọc Duy
  2 siblings, 2 replies; 83+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-08-31  2:49 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Incorporated suggestions by Nico and Junio. I went ahead and added
 escape hatches for converting thin packs to full ones so the document
 does not really match the code (I've been watching Nico's repository,
 commit reading is added, good stuff!)

 The proposal is, value 0 in the index to ident table is reserved,
 followed by the ident string. The real index to ident table is idx-1.

 Similarly, the value 1 in the index to path name table is reserved 
 (value 0 is already used for referring back to base tree) so the
 actual index is idx-2.

 Documentation/technical/pack-format.txt | 128 +++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt
index 8e5bf60..c866287 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -1,7 +1,7 @@
 Git pack format
 ===============
 
-== pack-*.pack files have the following format:
+== pack-*.pack files version 2 and 3 have the following format:
 
    - A header appears at the beginning and consists of the following:
 
@@ -36,6 +36,127 @@ Git pack format
 
   - The trailer records 20-byte SHA-1 checksum of all of the above.
 
+== pack-*.pack files version 4 have the following format:
+
+   - A header appears at the beginning and consists of the following:
+
+     4-byte signature:
+	The signature is: {'P', 'A', 'C', 'K'}
+
+     4-byte version number (network byte order): must be 4
+
+     4-byte number of objects contained in the pack (network byte order)
+
+   - A series of tables, described separately.
+
+   - The tables are followed by number of object entries, each of
+     which looks like below:
+
+     (undeltified representation)
+     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+     data
+
+     (deltified representation)
+     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+     base object name in SHA-1 reference encoding
+     compressed delta data
+
+     In undeltified format, blobs and tags ares compressed. Trees are
+     not compressed at all. Some headers in commits are stored
+     uncompressed, the rest is compressed. Tree and commit
+     representations are described in detail separately.
+
+     Blobs and tags are deltified and compressed the same way in
+     v3. Commits are not delitifed. Trees are deltified using
+     undeltified representation.
+
+  - The trailer records 20-byte SHA-1 checksum of all of the above.
+
+=== Pack v4 tables
+
+ - A table of sorted SHA-1 object names for all objects contained in
+   the pack.
+
+   This table can be referred to using "SHA-1 reference encoding":
+   It's an index number in variable length encoding. If it's
+   non-zero, its value minus one is the index in this table. If it's
+   zero, 20 bytes of SHA-1 is followed.
+
+ - Ident table: the uncompressed length in variable encoding,
+   followed by zlib-compressed dictionary. Each entry consists of
+   two prefix bytes storing timezone followed by a NUL-terminated
+   string.
+
+   Entries should be sorted by frequency so that the most frequent
+   entry has the smallest index, thus most efficient variable
+   encoding.
+
+   The table can be referred to using "ident reference encoding":
+   It's an index number in variable length encoding. If it's
+   non-zero, its value minus one is the index in this table. If it's
+   zero, a new entry in the same format is followed: two prefix
+   bytes and a NUL-terminated string.
+
+ - Tree path table: the same format to ident table. Each entry
+   consists of two prefix bytes storing tree entry mode, then a
+   NUL-terminated path name. Same sort order recommendation applies.
+
+=== Commit representation
+
+  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+
+  - Tree SHA-1 in SHA-1 reference encoding
+
+  - Parent count in variable length encoding
+
+  - Parent SHA-1s in SHA-1 reference encoding
+
+  - Author reference in ident reference encoding
+
+  - Author timestamp in variable length encoding
+
+  - Committer reference in ident reference encoding
+
+  - Committer timestamp in variable length encoding
+
+  - Compressed data of remaining header and the body
+
+=== Tree representation
+
+  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+
+  - Number of tree entries in variable length encoding
+
+  - A number of entries, each starting with path component reference:
+    an number, in variable length encoding.
+
+    If the path component reference is greater than 1, its value minus
+    two is the index in tree path table. The path component reference
+    is followed by the tree entry SHA-1 in SHA-1 reference encoding.
+
+    If the path component reference is 1, it's followed by
+
+    - two prefix bytes representing tree entry mode
+
+    - NUL-terminated path name
+
+    - tree entry SHA-1 in SHA-1 reference encoding
+
+    If the path component reference is zero, tree entries will be
+    copied from another tree. It's followed by:
+
+    - the starting index number, in variable length encoding, in the
+      base tree object to copy from. Bit zero in this number is base
+      tree flag, so the actual index is this number shifted right by
+      one bit.
+
+    - number of tree entries to copy from, in variable length encoding
+
+    - base tree in SHA-1 reference encoding if base tree flag is
+      set. If the flag is cleared, the previous base tree encountered
+      is used. This avoids repeating the same base tree SHA-1 in the
+      common case.
+
 == Original (version 1) pack-*.idx files have the following format:
 
   - The header consists of 256 4-byte network byte order
@@ -160,3 +281,8 @@ Pack file entry: <+
     corresponding packfile.
 
     20-byte SHA-1-checksum of all of the above.
+
+== Version 3 pack-*.idx files support only *.pack files version 4. The
+   format is the same as version 2 except that the table of sorted
+   20-byte SHA-1 object names is missing in the .idx files. The same
+   table exists in .pack files and will be used instead.
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH 23/23] initial pack index v3 support on the read side
  2013-08-27  4:26 ` [PATCH 23/23] initial pack index v3 support on the read side Nicolas Pitre
@ 2013-08-31 11:45   ` Duy Nguyen
  2013-09-03  6:09     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-08-31 11:45 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Tue, Aug 27, 2013 at 11:26 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> A bit crud but good enough for now.

I wonder if we should keep a short SHA-1 table in .idx. An entry in
the original .idx v1 table is <SHA-1>+<offset> then offset moved out
to make the table more compact for binary search. I observe that we
don't always need 20 byte SHA-1 to uniquely identify an entry in a
pack, so the SHA-1 table could be split in two: one table contain the
first part of SHA-1, long enough to identify any entry in the pack;
the second table contains either full SHA-1 or the remaining part.
Binary search is done on the first table, if matched, full sha-1 from
the second table is compared. We already have the second sha-1 table
in .pack v4. We could add the first table in .idx v3.

On linux-2.6 even in one-pack configuration, we only need the first 6
bytes of sha-1 to identify an object. That would cut the bsearch sha-1
table to 1/4 full sha-1 table size.

>
> Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
> ---
>  cache.h     |  1 +
>  sha1_file.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
>  2 files changed, 52 insertions(+), 7 deletions(-)
>
> diff --git a/cache.h b/cache.h
> index b6634c4..63066a1 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -1018,6 +1018,7 @@ extern struct packed_git {
>         off_t pack_size;
>         const void *index_data;
>         size_t index_size;
> +       const unsigned char *sha1_table;
>         uint32_t num_objects;
>         uint32_t num_bad_objects;
>         unsigned char *bad_object_sha1;
> diff --git a/sha1_file.c b/sha1_file.c
> index c2020d0..e9c54f4 100644
> --- a/sha1_file.c
> +++ b/sha1_file.c
> @@ -504,7 +504,7 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
>         hdr = idx_map;
>         if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) {
>                 version = ntohl(hdr->idx_version);
> -               if (version < 2 || version > 2) {
> +               if (version < 2 || version > 3) {
>                         munmap(idx_map, idx_size);
>                         return error("index file %s is version %"PRIu32
>                                      " and is not supported by this binary"
> @@ -539,12 +539,13 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
>                         munmap(idx_map, idx_size);
>                         return error("wrong index v1 file size in %s", path);
>                 }
> -       } else if (version == 2) {
> +       } else if (version == 2 || version == 3) {
> +               unsigned long min_size, max_size;
>                 /*
>                  * Minimum size:
>                  *  - 8 bytes of header
>                  *  - 256 index entries 4 bytes each
> -                *  - 20-byte sha1 entry * nr
> +                *  - 20-byte sha1 entry * nr (version 2 only)
>                  *  - 4-byte crc entry * nr
>                  *  - 4-byte offset entry * nr
>                  *  - 20-byte SHA1 of the packfile
> @@ -553,8 +554,10 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
>                  * variable sized table containing 8-byte entries
>                  * for offsets larger than 2^31.
>                  */
> -               unsigned long min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
> -               unsigned long max_size = min_size;
> +               min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
> +               if (version != 2)
> +                       min_size -= nr*20;
> +               max_size = min_size;
>                 if (nr)
>                         max_size += (nr - 1)*8;
>                 if (idx_size < min_size || idx_size > max_size) {
> @@ -573,6 +576,36 @@ static int check_packed_git_idx(const char *path,  struct packed_git *p)
>                 }
>         }
>
> +       if (version >= 3) {
> +               /* the SHA1 table is located in the main pack file */
> +               void *pack_map;
> +               struct pack_header *pack_hdr;
> +
> +               fd = git_open_noatime(p->pack_name);
> +               if (fd < 0) {
> +                       munmap(idx_map, idx_size);
> +                       return error("unable to open %s", p->pack_name);
> +               }
> +               if (fstat(fd, &st) != 0 || xsize_t(st.st_size) < 12 + nr*20) {
> +                       close(fd);
> +                       munmap(idx_map, idx_size);
> +                       return error("size of %s is wrong", p->pack_name);
> +               }
> +               pack_map = xmmap(NULL, 12 + nr*20, PROT_READ, MAP_PRIVATE, fd, 0);
> +               close(fd);
> +               pack_hdr = pack_map;
> +               if (pack_hdr->hdr_signature != htonl(PACK_SIGNATURE) ||
> +                   pack_hdr->hdr_version != htonl(4) ||
> +                   pack_hdr->hdr_entries != htonl(nr)) {
> +                       munmap(idx_map, idx_size);
> +                       munmap(pack_map, 12 + nr*20);
> +                       return error("packfile for %s doesn't match expectations", path);
> +               }
> +               p->sha1_table = pack_map;
> +               p->sha1_table += 12;
> +       } else
> +               p->sha1_table = NULL;
> +
>         p->index_version = version;
>         p->index_data = idx_map;
>         p->index_size = idx_size;
> @@ -697,6 +730,10 @@ void close_pack_index(struct packed_git *p)
>                 munmap((void *)p->index_data, p->index_size);
>                 p->index_data = NULL;
>         }
> +       if (p->sha1_table) {
> +               munmap((void *)(p->sha1_table - 12), 12 + p->num_objects * 20);
> +               p->sha1_table = NULL;
> +       }
>  }
>
>  /*
> @@ -808,7 +845,7 @@ static int open_packed_git_1(struct packed_git *p)
>                 return error("file %s is far too short to be a packfile", p->pack_name);
>         if (hdr.hdr_signature != htonl(PACK_SIGNATURE))
>                 return error("file %s is not a GIT packfile", p->pack_name);
> -       if (!pack_version_ok(hdr.hdr_version))
> +       if (!pack_version_ok(hdr.hdr_version) && hdr.hdr_version != htonl(4))
>                 return error("packfile %s is version %"PRIu32" and not"
>                         " supported (try upgrading GIT to a newer version)",
>                         p->pack_name, ntohl(hdr.hdr_version));
> @@ -2226,9 +2263,12 @@ const unsigned char *nth_packed_object_sha1(struct packed_git *p,
>         index += 4 * 256;
>         if (p->index_version == 1) {
>                 return index + 24 * n + 4;
> -       } else {
> +       } else if (p->index_version == 2) {
>                 index += 8;
>                 return index + 20 * n;
> +       } else {
> +               index = p->sha1_table;
> +               return index + 20 * n;
>         }
>  }
>
> @@ -2241,6 +2281,8 @@ off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n)
>         } else {
>                 uint32_t off;
>                 index += 8 + p->num_objects * (20 + 4);
> +               if (p->index_version != 2)
> +                       index -= p->num_objects * 20;
>                 off = ntohl(*((uint32_t *)(index + 4 * n)));
>                 if (!(off & 0x80000000))
>                         return off;
> @@ -2281,6 +2323,8 @@ off_t find_pack_entry_one(const unsigned char *sha1,
>                 stride = 24;
>                 index += 4;
>         }
> +       if (p->index_version > 2)
> +               index = p->sha1_table;
>
>         if (debug_lookup)
>                 printf("%02x%02x%02x... lo %u hi %u nr %"PRIu32"\n",
> --
> 1.8.4.22.g54757b7
>
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-08-27  4:25 ` [PATCH 09/23] pack v4: commit object encoding Nicolas Pitre
  2013-08-27 15:39   ` Junio C Hamano
@ 2013-09-02 20:48   ` Duy Nguyen
  2013-09-03  6:30     ` Nicolas Pitre
  1 sibling, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-02 20:48 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Tue, Aug 27, 2013 at 11:25 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> This goes as follows:
>
> - Tree reference: either variable length encoding of the index
>   into the SHA1 table or the literal SHA1 prefixed by 0 (see
>   add_sha1_ref()).
>
> - Parent count: variable length encoding of the number of parents.
>   This is normally going to occupy a single byte but doesn't have to.
>
> - List of parent references: a list of add_sha1_ref() encoded references,
>   or nothing if the parent count was zero.

With .pack v3 it's impossible to create delta cycles (3b910d0 add
tests for indexing packs with delta cycles - 2013-08-23) but it is
possible with .pack v4 (and therefore at least index-pack needs to
detect and reject them), correct? Some malicious user can create
commit A with parent ref 1, then make the SHA-1 table so that ref 1 is
A. The same with the new tree representation.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-08-31  2:49   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
@ 2013-09-03  6:00     ` Nicolas Pitre
  2013-09-03  6:46       ` Nicolas Pitre
  2013-09-06  2:14     ` [PATCH v3] " Nguyễn Thái Ngọc Duy
  1 sibling, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-03  6:00 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

[-- Attachment #1: Type: TEXT/PLAIN, Size: 550 bytes --]

On Sat, 31 Aug 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Incorporated suggestions by Nico and Junio. I went ahead and added
>  escape hatches for converting thin packs to full ones so the document
>  does not really match the code (I've been watching Nico's repository,
>  commit reading is added, good stuff!)

Now tree reading is added.  multiple encoding bug fixes trickled down to 
their originating commits as well.

Something is still wrong with deltas though.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 23/23] initial pack index v3 support on the read side
  2013-08-31 11:45   ` Duy Nguyen
@ 2013-09-03  6:09     ` Nicolas Pitre
  2013-09-03  7:34       ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-03  6:09 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

On Sat, 31 Aug 2013, Duy Nguyen wrote:

> On Tue, Aug 27, 2013 at 11:26 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > A bit crud but good enough for now.
> 
> I wonder if we should keep a short SHA-1 table in .idx. An entry in
> the original .idx v1 table is <SHA-1>+<offset> then offset moved out
> to make the table more compact for binary search. I observe that we
> don't always need 20 byte SHA-1 to uniquely identify an entry in a
> pack, so the SHA-1 table could be split in two: one table contain the
> first part of SHA-1, long enough to identify any entry in the pack;
> the second table contains either full SHA-1 or the remaining part.
> Binary search is done on the first table, if matched, full sha-1 from
> the second table is compared. We already have the second sha-1 table
> in .pack v4. We could add the first table in .idx v3.
> 
> On linux-2.6 even in one-pack configuration, we only need the first 6
> bytes of sha-1 to identify an object. That would cut the bsearch sha-1
> table to 1/4 full sha-1 table size.

I don't see the point though.

Why having two tables when only one suffice?

If the argument is about making the SHA1 binary search more efficient 
given a smaller memory footprint, that comes with extra complexity 
coming from the variable length SHA1 strings in the second table.  So 
I'm not sure there is much to gain.

Furthermore, one of the design of pack v4 is to avoid the SHA1 binary 
search entirely.  You will need the binary search only once to locate 
the top commit object, but from there the entire object tree can be 
walked simply by using the sha1ref index and looking up the 
corresponding offset into the pack index file to locate the next object.  

Same thing for walking tree objects: the pack v4 tree deltas are meant 
to be walked inline without having to _apply_ any delta.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-09-02 20:48   ` Duy Nguyen
@ 2013-09-03  6:30     ` Nicolas Pitre
  2013-09-03  7:41       ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-03  6:30 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

On Tue, 3 Sep 2013, Duy Nguyen wrote:

> On Tue, Aug 27, 2013 at 11:25 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > This goes as follows:
> >
> > - Tree reference: either variable length encoding of the index
> >   into the SHA1 table or the literal SHA1 prefixed by 0 (see
> >   add_sha1_ref()).
> >
> > - Parent count: variable length encoding of the number of parents.
> >   This is normally going to occupy a single byte but doesn't have to.
> >
> > - List of parent references: a list of add_sha1_ref() encoded references,
> >   or nothing if the parent count was zero.
> 
> With .pack v3 it's impossible to create delta cycles (3b910d0 add
> tests for indexing packs with delta cycles - 2013-08-23) but it is
> possible with .pack v4 (and therefore at least index-pack needs to
> detect and reject them), correct? Some malicious user can create
> commit A with parent ref 1, then make the SHA-1 table so that ref 1 is
> A. The same with the new tree representation.

pack-index should validate the SHA1 of the object being pointed at.

In that case I doubt you'll be able to actually construct an object 
which contains a SHA1 parent reference and make the SHA1 of this very 
object resolve to the same value.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-03  6:00     ` Nicolas Pitre
@ 2013-09-03  6:46       ` Nicolas Pitre
  2013-09-03 11:49         ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-03  6:46 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

[-- Attachment #1: Type: TEXT/PLAIN, Size: 802 bytes --]

On Tue, 3 Sep 2013, Nicolas Pitre wrote:

> On Sat, 31 Aug 2013, Nguyễn Thái Ngọc Duy wrote:
> 
> > 
> > Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> > ---
> >  Incorporated suggestions by Nico and Junio. I went ahead and added
> >  escape hatches for converting thin packs to full ones so the document
> >  does not really match the code (I've been watching Nico's repository,
> >  commit reading is added, good stuff!)
> 
> Now tree reading is added.  multiple encoding bug fixes trickled down to 
> their originating commits as well.
> 
> Something is still wrong with deltas though.

Deltas fixed now.

So... looks like pack v4 is now "functional".

However something is still wrong as it operates about 6 times slower 
than pack v3.

Anyone wishes to investigate?


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 23/23] initial pack index v3 support on the read side
  2013-09-03  6:09     ` Nicolas Pitre
@ 2013-09-03  7:34       ` Duy Nguyen
  0 siblings, 0 replies; 83+ messages in thread
From: Duy Nguyen @ 2013-09-03  7:34 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Tue, Sep 3, 2013 at 1:09 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Sat, 31 Aug 2013, Duy Nguyen wrote:
>
>> On Tue, Aug 27, 2013 at 11:26 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> > A bit crud but good enough for now.
>>
>> I wonder if we should keep a short SHA-1 table in .idx. An entry in
>> the original .idx v1 table is <SHA-1>+<offset> then offset moved out
>> to make the table more compact for binary search. I observe that we
>> don't always need 20 byte SHA-1 to uniquely identify an entry in a
>> pack, so the SHA-1 table could be split in two: one table contain the
>> first part of SHA-1, long enough to identify any entry in the pack;
>> the second table contains either full SHA-1 or the remaining part.
>> Binary search is done on the first table, if matched, full sha-1 from
>> the second table is compared. We already have the second sha-1 table
>> in .pack v4. We could add the first table in .idx v3.
>>
>> On linux-2.6 even in one-pack configuration, we only need the first 6
>> bytes of sha-1 to identify an object. That would cut the bsearch sha-1
>> table to 1/4 full sha-1 table size.
>
> I don't see the point though.
>
> Why having two tables when only one suffice?
>
> If the argument is about making the SHA1 binary search more efficient
> given a smaller memory footprint, that comes with extra complexity
> coming from the variable length SHA1 strings in the second table.  So
> I'm not sure there is much to gain.
>
> Furthermore, one of the design of pack v4 is to avoid the SHA1 binary
> search entirely.  You will need the binary search only once to locate
> the top commit object, but from there the entire object tree can be
> walked simply by using the sha1ref index and looking up the
> corresponding offset into the pack index file to locate the next object.
>
> Same thing for walking tree objects: the pack v4 tree deltas are meant
> to be walked inline without having to _apply_ any delta.

Hmm.. you are right. We avoid a lot of SHA-1 lookups with .pack v4. A
more compact SHA-1 table only benefits random SHA-1 lookups, which
should be much fewer compared to "rev-list --objects --all".
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-09-03  6:30     ` Nicolas Pitre
@ 2013-09-03  7:41       ` Duy Nguyen
  2013-09-05  3:50         ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-03  7:41 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List

On Tue, Sep 3, 2013 at 1:30 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Tue, 3 Sep 2013, Duy Nguyen wrote:
>
>> On Tue, Aug 27, 2013 at 11:25 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> > This goes as follows:
>> >
>> > - Tree reference: either variable length encoding of the index
>> >   into the SHA1 table or the literal SHA1 prefixed by 0 (see
>> >   add_sha1_ref()).
>> >
>> > - Parent count: variable length encoding of the number of parents.
>> >   This is normally going to occupy a single byte but doesn't have to.
>> >
>> > - List of parent references: a list of add_sha1_ref() encoded references,
>> >   or nothing if the parent count was zero.
>>
>> With .pack v3 it's impossible to create delta cycles (3b910d0 add
>> tests for indexing packs with delta cycles - 2013-08-23) but it is
>> possible with .pack v4 (and therefore at least index-pack needs to
>> detect and reject them), correct? Some malicious user can create
>> commit A with parent ref 1, then make the SHA-1 table so that ref 1 is
>> A. The same with the new tree representation.
>
> pack-index should validate the SHA1 of the object being pointed at.
>
> In that case I doubt you'll be able to actually construct an object
> which contains a SHA1 parent reference and make the SHA1 of this very
> object resolve to the same value.

We could do that for commits. For trees, we need to look at the base's
content to construct the current tree and cycles could happen. I think
we could make a rule that base trees must appear in the pack before
the tree being constructed (similar to delta-ofs). The exception is
objects appended for fixing thin pack.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-03  6:46       ` Nicolas Pitre
@ 2013-09-03 11:49         ` Duy Nguyen
  2013-09-03 14:54           ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-03 11:49 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Tue, Sep 3, 2013 at 1:46 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> So... looks like pack v4 is now "functional".
>
> However something is still wrong as it operates about 6 times slower
> than pack v3.
>
> Anyone wishes to investigate?

You recurse in decode_entries too deep.I check the first 10000
decode_entries() calls in pv4_get_tree(). The deepest level is 3491.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-03 11:49         ` Duy Nguyen
@ 2013-09-03 14:54           ` Duy Nguyen
  2013-09-05  4:12             ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-03 14:54 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Tue, Sep 3, 2013 at 6:49 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Tue, Sep 3, 2013 at 1:46 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> So... looks like pack v4 is now "functional".
>>
>> However something is still wrong as it operates about 6 times slower
>> than pack v3.
>>
>> Anyone wishes to investigate?
>
> You recurse in decode_entries too deep.I check the first 10000
> decode_entries() calls in pv4_get_tree(). The deepest level is 3491.

And I was wrong, the call depth is not that deep, but the number of
decode_entries calls triggered by one pv4_get_tree() is that many.
This is on git.git and the tree being processed is "t", which has 672
entries.. There are funny access patterns. This is the output of

   fprintf(stderr, "[%d] %d - %d %u\n", call_depth, copy_start,
copy_count, copy_objoffset);

[1] 0 - 1 48838573
[2] 0 - 1 48826699
[3] 0 - 1 48820760
[4] 0 - 1 48814812
[5] 0 - 1 48805904
[6] 0 - 1 48797000
[7] 0 - 1 48794034
[8] 0 - 1 48791067
[9] 0 - 1 48788100
[10] 0 - 1 48785134
[11] 0 - 1 48776221
[12] 0 - 1 48764321
[13] 0 - 1 48503227
[14] 0 - 1 48485415
[15] 0 - 1 48473512
[16] 0 - 1 48443621
[17] 0 - 1 48401788
[18] 0 - 1 48377834
[19] 0 - 1 48371841
[20] 0 - 1 48341809
[21] 0 - 1 48260734
[22] 0 - 1 48236635
[23] 0 - 1 46845105
[24] 0 - 1 14603061
[25] 2 - 1 48838573
[2] 0 - 1 48826699

It goes through 20+ base trees just to get one tree entry, I think..
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH 09/23] pack v4: commit object encoding
  2013-09-03  7:41       ` Duy Nguyen
@ 2013-09-05  3:50         ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-05  3:50 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List

On Tue, 3 Sep 2013, Duy Nguyen wrote:

> On Tue, Sep 3, 2013 at 1:30 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > On Tue, 3 Sep 2013, Duy Nguyen wrote:
> >
> >> On Tue, Aug 27, 2013 at 11:25 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> >> > This goes as follows:
> >> >
> >> > - Tree reference: either variable length encoding of the index
> >> >   into the SHA1 table or the literal SHA1 prefixed by 0 (see
> >> >   add_sha1_ref()).
> >> >
> >> > - Parent count: variable length encoding of the number of parents.
> >> >   This is normally going to occupy a single byte but doesn't have to.
> >> >
> >> > - List of parent references: a list of add_sha1_ref() encoded references,
> >> >   or nothing if the parent count was zero.
> >>
> >> With .pack v3 it's impossible to create delta cycles (3b910d0 add
> >> tests for indexing packs with delta cycles - 2013-08-23) but it is
> >> possible with .pack v4 (and therefore at least index-pack needs to
> >> detect and reject them), correct? Some malicious user can create
> >> commit A with parent ref 1, then make the SHA-1 table so that ref 1 is
> >> A. The same with the new tree representation.
> >
> > pack-index should validate the SHA1 of the object being pointed at.
> >
> > In that case I doubt you'll be able to actually construct an object
> > which contains a SHA1 parent reference and make the SHA1 of this very
> > object resolve to the same value.
> 
> We could do that for commits. For trees, we need to look at the base's
> content to construct the current tree and cycles could happen. I think
> we could make a rule that base trees must appear in the pack before
> the tree being constructed (similar to delta-ofs). The exception is
> objects appended for fixing thin pack.

I don't really like such artificial constraints.  You never know when 
such constraints are going to prevent future legitimate pack layout 
optimizations or the like.

It is simple enough to detect cycles during delta validation in 
index-pack anyway.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-03 14:54           ` Duy Nguyen
@ 2013-09-05  4:12             ` Nicolas Pitre
  2013-09-05  4:19               ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-05  4:12 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

On Tue, 3 Sep 2013, Duy Nguyen wrote:

> On Tue, Sep 3, 2013 at 6:49 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> > On Tue, Sep 3, 2013 at 1:46 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> >> So... looks like pack v4 is now "functional".
> >>
> >> However something is still wrong as it operates about 6 times slower
> >> than pack v3.
> >>
> >> Anyone wishes to investigate?
> >
> > You recurse in decode_entries too deep.I check the first 10000
> > decode_entries() calls in pv4_get_tree(). The deepest level is 3491.
> 
> And I was wrong, the call depth is not that deep, but the number of
> decode_entries calls triggered by one pv4_get_tree() is that many.
> This is on git.git and the tree being processed is "t", which has 672
> entries.. There are funny access patterns. This is the output of
> 
>    fprintf(stderr, "[%d] %d - %d %u\n", call_depth, copy_start,
> copy_count, copy_objoffset);
> 
> [1] 0 - 1 48838573
> [2] 0 - 1 48826699
> [3] 0 - 1 48820760
> [4] 0 - 1 48814812
> [5] 0 - 1 48805904
> [6] 0 - 1 48797000
> [7] 0 - 1 48794034
> [8] 0 - 1 48791067
> [9] 0 - 1 48788100
> [10] 0 - 1 48785134
> [11] 0 - 1 48776221
> [12] 0 - 1 48764321
> [13] 0 - 1 48503227
> [14] 0 - 1 48485415
> [15] 0 - 1 48473512
> [16] 0 - 1 48443621
> [17] 0 - 1 48401788
> [18] 0 - 1 48377834
> [19] 0 - 1 48371841
> [20] 0 - 1 48341809
> [21] 0 - 1 48260734
> [22] 0 - 1 48236635
> [23] 0 - 1 46845105
> [24] 0 - 1 14603061
> [25] 2 - 1 48838573
> [2] 0 - 1 48826699
> 
> It goes through 20+ base trees just to get one tree entry, I think..

Yeah... that's true.  The encoding should refer to the deepest tree 
directly in that case.  Better delta heuristics will have to be worked 
out here.  The code as it is now can't do that.

There was also a bug that prevented larger copy sequences to be created 
which is now fixed.

I added to packv4-create the ability to specify the minimum range of 
consecutive entries that can be represented by a copy sequence to allow 
experiments.  However, even when the tree deltas are completely disabled 
(using --min-tree-copy=0 achieves that) the CPU usage is still much 
higher which is rather unexpected.  In theory this shouldn't be the 
case.

Many other bugs have now been fixed.  A git.git repository with packs 
version 4 appears to be functional and passes git-fsck --full --strict.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05  4:12             ` Nicolas Pitre
@ 2013-09-05  4:19               ` Duy Nguyen
  2013-09-05  4:40                 ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-05  4:19 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Thu, Sep 5, 2013 at 11:12 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> Many other bugs have now been fixed.  A git.git repository with packs
> version 4 appears to be functional and passes git-fsck --full --strict.

Yeah I was looking at the diff some minutes ago, saw changes in
pack-check.c and wondering if fsck was working. I'll add v4 support to
index-pack. Waiting to see the new, v4-aware tree walker interface
with good "rev-list --all --objects" numbers from you.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05  4:19               ` Duy Nguyen
@ 2013-09-05  4:40                 ` Nicolas Pitre
  2013-09-05  5:04                   ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-05  4:40 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

On Thu, 5 Sep 2013, Duy Nguyen wrote:

> On Thu, Sep 5, 2013 at 11:12 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > Many other bugs have now been fixed.  A git.git repository with packs
> > version 4 appears to be functional and passes git-fsck --full --strict.
> 
> Yeah I was looking at the diff some minutes ago, saw changes in
> pack-check.c and wondering if fsck was working. I'll add v4 support to
> index-pack.

Beware that the tree delta encoding has changed a little.  This saved up 
to 2% on some repos.

I'll probably change the encoding to incorporate the escape hatch 
for path and name references as discussed previously.

> Waiting to see the new, v4-aware tree walker interface
> with good "rev-list --all --objects" numbers from you.

Well, unfortunately I've put more time than I really had available into 
this project lately.  I'm about to call for other people to take over it 
and pursue this work further.

I really wanted to set the pack format direction since I've been toying 
with this for so many years.  Now the tool to convert a pack is there, 
and the read side is also there, proving that the format does work and 
the encoding and decoding code is functional and may serve as reference.  
So that's about the extent of what I can contribute at this point.

I'll be happy to provide design assistance and code review comments of 
course.  But I won't be able to put the time to do the actual coding 
myself much longer.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05  4:40                 ` Nicolas Pitre
@ 2013-09-05  5:04                   ` Duy Nguyen
  2013-09-05  5:39                     ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-05  5:04 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Thu, Sep 5, 2013 at 11:40 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Thu, 5 Sep 2013, Duy Nguyen wrote:
>
>> On Thu, Sep 5, 2013 at 11:12 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> > Many other bugs have now been fixed.  A git.git repository with packs
>> > version 4 appears to be functional and passes git-fsck --full --strict.
>>
>> Yeah I was looking at the diff some minutes ago, saw changes in
>> pack-check.c and wondering if fsck was working. I'll add v4 support to
>> index-pack.
>
> Beware that the tree delta encoding has changed a little.  This saved up
> to 2% on some repos.

Thanks for the heads up.

> I'll probably change the encoding to incorporate the escape hatch
> for path and name references as discussed previously.
>
>> Waiting to see the new, v4-aware tree walker interface
>> with good "rev-list --all --objects" numbers from you.
>
> Well, unfortunately I've put more time than I really had available into
> this project lately.  I'm about to call for other people to take over it
> and pursue this work further.
>
> I really wanted to set the pack format direction since I've been toying
> with this for so many years.  Now the tool to convert a pack is there,
> and the read side is also there, proving that the format does work and
> the encoding and decoding code is functional and may serve as reference.
> So that's about the extent of what I can contribute at this point.
>
> I'll be happy to provide design assistance and code review comments of
> course.  But I won't be able to put the time to do the actual coding
> myself much longer.

You've done a great job in designing v4 and getting basic support in
place. I think you'll need to post your series again so Junio can pick
it up. Then we (at least I) will try to continue from there. I have
high hopes that this will not drop out like the spit-blob series.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05  5:04                   ` Duy Nguyen
@ 2013-09-05  5:39                     ` Nicolas Pitre
  2013-09-05 16:52                       ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-05  5:39 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

On Thu, 5 Sep 2013, Duy Nguyen wrote:

> On Thu, Sep 5, 2013 at 11:40 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > On Thu, 5 Sep 2013, Duy Nguyen wrote:
> >
> >> On Thu, Sep 5, 2013 at 11:12 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> >> > Many other bugs have now been fixed.  A git.git repository with packs
> >> > version 4 appears to be functional and passes git-fsck --full --strict.
> >>
> >> Yeah I was looking at the diff some minutes ago, saw changes in
> >> pack-check.c and wondering if fsck was working. I'll add v4 support to
> >> index-pack.
> >
> > Beware that the tree delta encoding has changed a little.  This saved up
> > to 2% on some repos.
> 
> Thanks for the heads up.
> 
> > I'll probably change the encoding to incorporate the escape hatch
> > for path and name references as discussed previously.

this is now committed.  I don't think there should be any more pack 
format changes at this point.

Now the pack index v3 probably needs to be improved a little, again to 
accommodate completion of thin packs.  Given that the main SHA1 table is 
now in the main pack file, it should be possible to still carry a small 
SHA1 table in the index file that corresponds to the appended objects 
only. This means that a SHA1 search will have to first use the main SHA1 
table in the pack file as it is done now, and if not found then use the 
SHA1 table in the index file if it exists.  And of course 
nth_packed_object_sha1() will have to be adjusted accordingly.



Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05  5:39                     ` Nicolas Pitre
@ 2013-09-05 16:52                       ` Duy Nguyen
  2013-09-05 17:14                         ` Nicolas Pitre
  2013-09-06  4:18                         ` Duy Nguyen
  0 siblings, 2 replies; 83+ messages in thread
From: Duy Nguyen @ 2013-09-05 16:52 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Thu, Sep 5, 2013 at 12:39 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> Now the pack index v3 probably needs to be improved a little, again to
> accommodate completion of thin packs.  Given that the main SHA1 table is
> now in the main pack file, it should be possible to still carry a small
> SHA1 table in the index file that corresponds to the appended objects
> only. This means that a SHA1 search will have to first use the main SHA1
> table in the pack file as it is done now, and if not found then use the
> SHA1 table in the index file if it exists.  And of course
> nth_packed_object_sha1() will have to be adjusted accordingly.

What if the sender prepares the sha-1 table to contain missing objects
in advance? The sender should know what base objects are missing. Then
we only need to append objects at the receiving end and verify that
all new objects are also present in the sha-1 table.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05 16:52                       ` Duy Nguyen
@ 2013-09-05 17:14                         ` Nicolas Pitre
  2013-09-05 20:26                           ` Junio C Hamano
  2013-09-06  4:18                         ` Duy Nguyen
  1 sibling, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-05 17:14 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

On Thu, 5 Sep 2013, Duy Nguyen wrote:

> On Thu, Sep 5, 2013 at 12:39 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > Now the pack index v3 probably needs to be improved a little, again to
> > accommodate completion of thin packs.  Given that the main SHA1 table is
> > now in the main pack file, it should be possible to still carry a small
> > SHA1 table in the index file that corresponds to the appended objects
> > only. This means that a SHA1 search will have to first use the main SHA1
> > table in the pack file as it is done now, and if not found then use the
> > SHA1 table in the index file if it exists.  And of course
> > nth_packed_object_sha1() will have to be adjusted accordingly.
> 
> What if the sender prepares the sha-1 table to contain missing objects
> in advance? The sender should know what base objects are missing. Then
> we only need to append objects at the receiving end and verify that
> all new objects are also present in the sha-1 table.

I do like this idea very much.  And that doesn't increase the thin pack 
size as the larger SHA1 table will be compensated by a smaller sha1ref 
encoding in those objects referring to the missing ones.



Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05 17:14                         ` Nicolas Pitre
@ 2013-09-05 20:26                           ` Junio C Hamano
  2013-09-05 21:04                             ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Junio C Hamano @ 2013-09-05 20:26 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Duy Nguyen, Git Mailing List

Nicolas Pitre <nico@fluxnic.net> writes:

> On Thu, 5 Sep 2013, Duy Nguyen wrote:
>
>> On Thu, Sep 5, 2013 at 12:39 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> > Now the pack index v3 probably needs to be improved a little, again to
>> > accommodate completion of thin packs.  Given that the main SHA1 table is
>> > now in the main pack file, it should be possible to still carry a small
>> > SHA1 table in the index file that corresponds to the appended objects
>> > only. This means that a SHA1 search will have to first use the main SHA1
>> > table in the pack file as it is done now, and if not found then use the
>> > SHA1 table in the index file if it exists.  And of course
>> > nth_packed_object_sha1() will have to be adjusted accordingly.
>> 
>> What if the sender prepares the sha-1 table to contain missing objects
>> in advance? The sender should know what base objects are missing. Then
>> we only need to append objects at the receiving end and verify that
>> all new objects are also present in the sha-1 table.
>
> I do like this idea very much.  And that doesn't increase the thin pack 
> size as the larger SHA1 table will be compensated by a smaller sha1ref 
> encoding in those objects referring to the missing ones.

Let me see if I understand the proposal correctly.  Compared to a
normal pack-v4 stream, a thin pack-v4 stream:

 - has all the SHA-1 object names involved in the stream in its main
   object name table---most importantly, names of objects that
   "thin" optimization omits from the pack data body are included;

 - uses the SHA-1 object name table offset to refer to other
   objects, even to ones that thin stream will not transfer in the
   pack data body;

 - is completed at the receiving end by appending the data for the
   objects that were not transferred due to the "thin" optimization.

So the invariant "all objects contained in the pack" in:

 - A table of sorted SHA-1 object names for all objects contained in
   the pack.

that appears in Documentation/technical/pack-format.txt is still
kept at the end, and more importantly, any object that is mentioned
in this table can be reconstructed by using pack data in the same
packfile without referencing anything else.  Most importantly, if we
were to build a v2 .idx file for the resulting .pack, the list of
object names in the .idx file would be identical to the object names
in this table in the .pack file.

If that is the case, I too like this.

I briefly wondered if it makes sense to mention objects that are
often referred to that do not exist in the pack in this table
(e.g. new commits included in this pack refer to a tree object that
has not changed for ages---their trees mention this subtree using a
"SHA-1 reference encoding" and being able to name the old,
unchanging tree with an index to the object table may save space),
but that would break the above invariant in a big way---some objects
mentioned in the table may not exist in the packfile itself---and it
probably is not a good idea.  Unlike that broken idea, "include
names of the objects that will be appended anyway" approach to help
fattening a thin-pack makes very good sense to me.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05 20:26                           ` Junio C Hamano
@ 2013-09-05 21:04                             ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-05 21:04 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Duy Nguyen, Git Mailing List

On Thu, 5 Sep 2013, Junio C Hamano wrote:

> Nicolas Pitre <nico@fluxnic.net> writes:
> 
> > On Thu, 5 Sep 2013, Duy Nguyen wrote:
> >
> >> On Thu, Sep 5, 2013 at 12:39 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> >> > Now the pack index v3 probably needs to be improved a little, again to
> >> > accommodate completion of thin packs.  Given that the main SHA1 table is
> >> > now in the main pack file, it should be possible to still carry a small
> >> > SHA1 table in the index file that corresponds to the appended objects
> >> > only. This means that a SHA1 search will have to first use the main SHA1
> >> > table in the pack file as it is done now, and if not found then use the
> >> > SHA1 table in the index file if it exists.  And of course
> >> > nth_packed_object_sha1() will have to be adjusted accordingly.
> >> 
> >> What if the sender prepares the sha-1 table to contain missing objects
> >> in advance? The sender should know what base objects are missing. Then
> >> we only need to append objects at the receiving end and verify that
> >> all new objects are also present in the sha-1 table.
> >
> > I do like this idea very much.  And that doesn't increase the thin pack 
> > size as the larger SHA1 table will be compensated by a smaller sha1ref 
> > encoding in those objects referring to the missing ones.
> 
> Let me see if I understand the proposal correctly.  Compared to a
> normal pack-v4 stream, a thin pack-v4 stream:
> 
>  - has all the SHA-1 object names involved in the stream in its main
>    object name table---most importantly, names of objects that
>    "thin" optimization omits from the pack data body are included;
> 
>  - uses the SHA-1 object name table offset to refer to other
>    objects, even to ones that thin stream will not transfer in the
>    pack data body;
> 
>  - is completed at the receiving end by appending the data for the
>    objects that were not transferred due to the "thin" optimization.
> 
> So the invariant "all objects contained in the pack" in:
> 
>  - A table of sorted SHA-1 object names for all objects contained in
>    the pack.
> 
> that appears in Documentation/technical/pack-format.txt is still
> kept at the end, and more importantly, any object that is mentioned
> in this table can be reconstructed by using pack data in the same
> packfile without referencing anything else.  Most importantly, if we
> were to build a v2 .idx file for the resulting .pack, the list of
> object names in the .idx file would be identical to the object names
> in this table in the .pack file.

That is right.

> If that is the case, I too like this.
> 
> I briefly wondered if it makes sense to mention objects that are
> often referred to that do not exist in the pack in this table
> (e.g. new commits included in this pack refer to a tree object that
> has not changed for ages---their trees mention this subtree using a
> "SHA-1 reference encoding" and being able to name the old,
> unchanging tree with an index to the object table may save space),
> but that would break the above invariant in a big way---some objects
> mentioned in the table may not exist in the packfile itself---and it
> probably is not a good idea.

Yet, if an old tree that doesn't change is often referred to, it should 
be possible to have only one such reference in the whole pack and all 
the other trees can use a delta copy sequence to refer to it.  At this 
point whether or not the tree being referred to is listed inline or in 
the SHA1 table doesn't make a big difference.

> Unlike that broken idea, "include
> names of the objects that will be appended anyway" approach to help
> fattening a thin-pack makes very good sense to me.
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v3] Document pack v4 format
  2013-08-31  2:49   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2013-09-03  6:00     ` Nicolas Pitre
@ 2013-09-06  2:14     ` Nguyễn Thái Ngọc Duy
  2013-09-06  3:23       ` Nicolas Pitre
  2013-09-07  4:52       ` Nicolas Pitre
  1 sibling, 2 replies; 83+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2013-09-06  2:14 UTC (permalink / raw)
  To: git; +Cc: Nicolas Pitre, Junio C Hamano, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Should be up to date with Nico's latest implementation and also cover
 additions to the format that everybody seems to agree on:

  - new types for canonical trees and commits
  - sha-1 table covering missing objects in thin packs

 Documentation/technical/pack-format.txt | 133 +++++++++++++++++++++++++++++++-
 1 file changed, 132 insertions(+), 1 deletion(-)

diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt
index 8e5bf60..c5327ff 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -1,7 +1,7 @@
 Git pack format
 ===============
 
-== pack-*.pack files have the following format:
+== pack-*.pack files version 2 and 3 have the following format:
 
    - A header appears at the beginning and consists of the following:
 
@@ -36,6 +36,132 @@ Git pack format
 
   - The trailer records 20-byte SHA-1 checksum of all of the above.
 
+== pack-*.pack files version 4 have the following format:
+
+   - A header appears at the beginning and consists of the following:
+
+     4-byte signature:
+	The signature is: {'P', 'A', 'C', 'K'}
+
+     4-byte version number (network byte order): must be 4
+
+     4-byte number of objects contained in the pack (network byte order)
+
+   - A series of tables, described separately.
+
+   - The tables are followed by number of object entries, each of
+     which looks like below:
+
+     (undeltified representation)
+     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+     data
+
+     (deltified representation)
+     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+     base object name in SHA-1 reference encoding
+     compressed delta data
+
+     "type" is used to determine object type. Commit has type 1, tree
+     2, blob 3, tag 4, ref-delta 7, canonical-commit 9 (commit type
+     with bit 3 set), canonical-tree 10 (tree type with bit 3 set).
+     Compared to v2, ofs-delta type is not used, and canonical-commit
+     and canonical-tree are new types.
+
+     In undeltified format, blobs and tags ares compressed. Trees are
+     not compressed at all. Some headers in commits are stored
+     uncompressed, the rest is compressed. Tree and commit
+     representations are described in detail separately.
+
+     Blobs and tags are deltified and compressed the same way in
+     v3. Commits are not delitifed. Trees are deltified using
+     undeltified representation.
+
+     Trees and commits in canonical types are in the same format as
+     v2: in canonical format and deflated. They can be used for
+     completing thin packs or preserving somewhat ill-formatted
+     objects.
+
+  - The trailer records 20-byte SHA-1 checksum of all of the above.
+
+=== Pack v4 tables
+
+ - A table of sorted SHA-1 object names for all objects contained in
+   the on-disk pack.
+
+   Thin packs are used for transferring on the wire and may omit base
+   objects, expecting the receiver to add them before writing to
+   disk. The SHA-1 table in thin packs must include the omitted objects
+   as well.
+
+   This table can be referred to using "SHA-1 reference encoding": the
+   index, in variable length encoding, to this table.
+
+ - Ident table: the uncompressed length in variable encoding,
+   followed by zlib-compressed dictionary. Each entry consists of
+   two prefix bytes storing timezone followed by a NUL-terminated
+   string.
+
+   Entries should be sorted by frequency so that the most frequent
+   entry has the smallest index, thus most efficient variable
+   encoding.
+
+   The table can be referred to using "ident reference encoding": the
+   index number, in variable length encoding, to this table.
+
+ - Tree path table: the same format to ident table. Each entry
+   consists of two prefix bytes storing tree entry mode, then a
+   NUL-terminated path name. Same sort order recommendation applies.
+
+=== Commit representation
+
+  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+
+  - Tree SHA-1 in SHA-1 reference encoding
+
+  - Parent count in variable length encoding
+
+  - Parent SHA-1s in SHA-1 reference encoding
+
+  - Author reference in ident reference encoding
+
+  - Author timestamp in variable length encoding
+
+  - Committer reference in ident reference encoding
+
+  - Committer timestamp, encoded as a difference against author
+    timestamp with the LSB used to indicate negative difference.
+
+  - Compressed data of remaining header and the body
+
+=== Tree representation
+
+  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
+
+  - Number of tree entries in variable length encoding
+
+  - A number of entries, each can be in either forms
+
+    - INT(path_index << 1)        INT(sha1_index)
+
+    - INT((entry_start << 1) | 1) INT(entry_count << 1)
+
+    - INT((entry_start << 1) | 1) INT((entry_count << 1) | 1) INT(base_sha1_index)
+
+    INT() denotes a number in variable length encoding. path_index is
+    the index to the tree path table. sha1_index is the index to the
+    SHA-1 table. entry_start is the first tree entry to copy
+    from. entry_count is the number of tree entries to
+    copy. base_sha1_index is the index to SHA-1 table of the base tree
+    to copy from.
+
+    The LSB of the first number indicates whether it's a plain tree
+    entry (LSB not set), or an instruction to copy tree entries from
+    another tree (LSB set).
+
+    For copying from another tree, is the LSB of the second number is
+    set, it will be followed by a base tree SHA-1. If it's not set,
+    the last base tree will be used.
+
 == Original (version 1) pack-*.idx files have the following format:
 
   - The header consists of 256 4-byte network byte order
@@ -160,3 +286,8 @@ Pack file entry: <+
     corresponding packfile.
 
     20-byte SHA-1-checksum of all of the above.
+
+== Version 3 pack-*.idx files support only *.pack files version 4. The
+   format is the same as version 2 except that the table of sorted
+   20-byte SHA-1 object names is missing in the .idx files. The same
+   table exists in .pack files and will be used instead.
-- 
1.8.2.83.gc99314b

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06  2:14     ` [PATCH v3] " Nguyễn Thái Ngọc Duy
@ 2013-09-06  3:23       ` Nicolas Pitre
  2013-09-06  9:48         ` Duy Nguyen
  2013-09-07  4:52       ` Nicolas Pitre
  1 sibling, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-06  3:23 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

[-- Attachment #1: Type: TEXT/PLAIN, Size: 8522 bytes --]

On Fri, 6 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Should be up to date with Nico's latest implementation and also cover
>  additions to the format that everybody seems to agree on:
> 
>   - new types for canonical trees and commits
>   - sha-1 table covering missing objects in thin packs

Great!  I've merged this into my branch with the following amendment:

diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt
index 1980794..d0c2cde 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -81,6 +81,13 @@ Git pack format
      completing thin packs or preserving somewhat ill-formatted
      objects.
 
+     Thin packs are used for transferring on the wire and may omit delta
+     base objects, expecting the receiver to add them at the end of the pack
+     before writing to disk.  The number of objects contained in the pack
+     header must account for those omitted objects in any case. To indicate
+     no more objects are included in a thin pack, a "type 0" byte
+     indicator is inserted after the last transmitted object.
+
   - The trailer records 20-byte SHA-1 checksum of all of the above.
 
 === Pack v4 tables
@@ -88,10 +95,7 @@ Git pack format
  - A table of sorted SHA-1 object names for all objects contained in
    the on-disk pack.
 
-   Thin packs are used for transferring on the wire and may omit base
-   objects, expecting the receiver to add them before writing to
-   disk. The SHA-1 table in thin packs must include the omitted objects
-   as well.
+   The SHA-1 table in thin packs must include the omitted objects as well.
 
    This table can be referred to using "SHA-1 reference encoding": the
    index, in variable length encoding, to this table.
@@ -158,7 +162,7 @@ Git pack format
     entry (LSB not set), or an instruction to copy tree entries from
     another tree (LSB set).
 
-    For copying from another tree, is the LSB of the second number is
+    For copying from another tree, if the LSB of the second number is
     set, it will be followed by a base tree SHA-1. If it's not set,
     the last base tree will be used.
 

> diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt
> index 8e5bf60..c5327ff 100644
> --- a/Documentation/technical/pack-format.txt
> +++ b/Documentation/technical/pack-format.txt
> @@ -1,7 +1,7 @@
>  Git pack format
>  ===============
>  
> -== pack-*.pack files have the following format:
> +== pack-*.pack files version 2 and 3 have the following format:
>  
>     - A header appears at the beginning and consists of the following:
>  
> @@ -36,6 +36,132 @@ Git pack format
>  
>    - The trailer records 20-byte SHA-1 checksum of all of the above.
>  
> +== pack-*.pack files version 4 have the following format:
> +
> +   - A header appears at the beginning and consists of the following:
> +
> +     4-byte signature:
> +	The signature is: {'P', 'A', 'C', 'K'}
> +
> +     4-byte version number (network byte order): must be 4
> +
> +     4-byte number of objects contained in the pack (network byte order)
> +
> +   - A series of tables, described separately.
> +
> +   - The tables are followed by number of object entries, each of
> +     which looks like below:
> +
> +     (undeltified representation)
> +     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +     data
> +
> +     (deltified representation)
> +     n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +     base object name in SHA-1 reference encoding
> +     compressed delta data
> +
> +     "type" is used to determine object type. Commit has type 1, tree
> +     2, blob 3, tag 4, ref-delta 7, canonical-commit 9 (commit type
> +     with bit 3 set), canonical-tree 10 (tree type with bit 3 set).
> +     Compared to v2, ofs-delta type is not used, and canonical-commit
> +     and canonical-tree are new types.
> +
> +     In undeltified format, blobs and tags ares compressed. Trees are
> +     not compressed at all. Some headers in commits are stored
> +     uncompressed, the rest is compressed. Tree and commit
> +     representations are described in detail separately.
> +
> +     Blobs and tags are deltified and compressed the same way in
> +     v3. Commits are not delitifed. Trees are deltified using
> +     undeltified representation.
> +
> +     Trees and commits in canonical types are in the same format as
> +     v2: in canonical format and deflated. They can be used for
> +     completing thin packs or preserving somewhat ill-formatted
> +     objects.
> +
> +  - The trailer records 20-byte SHA-1 checksum of all of the above.
> +
> +=== Pack v4 tables
> +
> + - A table of sorted SHA-1 object names for all objects contained in
> +   the on-disk pack.
> +
> +   Thin packs are used for transferring on the wire and may omit base
> +   objects, expecting the receiver to add them before writing to
> +   disk. The SHA-1 table in thin packs must include the omitted objects
> +   as well.
> +
> +   This table can be referred to using "SHA-1 reference encoding": the
> +   index, in variable length encoding, to this table.
> +
> + - Ident table: the uncompressed length in variable encoding,
> +   followed by zlib-compressed dictionary. Each entry consists of
> +   two prefix bytes storing timezone followed by a NUL-terminated
> +   string.
> +
> +   Entries should be sorted by frequency so that the most frequent
> +   entry has the smallest index, thus most efficient variable
> +   encoding.
> +
> +   The table can be referred to using "ident reference encoding": the
> +   index number, in variable length encoding, to this table.
> +
> + - Tree path table: the same format to ident table. Each entry
> +   consists of two prefix bytes storing tree entry mode, then a
> +   NUL-terminated path name. Same sort order recommendation applies.
> +
> +=== Commit representation
> +
> +  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +
> +  - Tree SHA-1 in SHA-1 reference encoding
> +
> +  - Parent count in variable length encoding
> +
> +  - Parent SHA-1s in SHA-1 reference encoding
> +
> +  - Author reference in ident reference encoding
> +
> +  - Author timestamp in variable length encoding
> +
> +  - Committer reference in ident reference encoding
> +
> +  - Committer timestamp, encoded as a difference against author
> +    timestamp with the LSB used to indicate negative difference.
> +
> +  - Compressed data of remaining header and the body
> +
> +=== Tree representation
> +
> +  - n-byte type and length (4-bit type, (n-1)*7+4-bit length)
> +
> +  - Number of tree entries in variable length encoding
> +
> +  - A number of entries, each can be in either forms
> +
> +    - INT(path_index << 1)        INT(sha1_index)
> +
> +    - INT((entry_start << 1) | 1) INT(entry_count << 1)
> +
> +    - INT((entry_start << 1) | 1) INT((entry_count << 1) | 1) INT(base_sha1_index)
> +
> +    INT() denotes a number in variable length encoding. path_index is
> +    the index to the tree path table. sha1_index is the index to the
> +    SHA-1 table. entry_start is the first tree entry to copy
> +    from. entry_count is the number of tree entries to
> +    copy. base_sha1_index is the index to SHA-1 table of the base tree
> +    to copy from.
> +
> +    The LSB of the first number indicates whether it's a plain tree
> +    entry (LSB not set), or an instruction to copy tree entries from
> +    another tree (LSB set).
> +
> +    For copying from another tree, is the LSB of the second number is
> +    set, it will be followed by a base tree SHA-1. If it's not set,
> +    the last base tree will be used.
> +
>  == Original (version 1) pack-*.idx files have the following format:
>  
>    - The header consists of 256 4-byte network byte order
> @@ -160,3 +286,8 @@ Pack file entry: <+
>      corresponding packfile.
>  
>      20-byte SHA-1-checksum of all of the above.
> +
> +== Version 3 pack-*.idx files support only *.pack files version 4. The
> +   format is the same as version 2 except that the table of sorted
> +   20-byte SHA-1 object names is missing in the .idx files. The same
> +   table exists in .pack files and will be used instead.
> -- 
> 1.8.2.83.gc99314b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-05 16:52                       ` Duy Nguyen
  2013-09-05 17:14                         ` Nicolas Pitre
@ 2013-09-06  4:18                         ` Duy Nguyen
  2013-09-06 13:19                           ` Nicolas Pitre
  1 sibling, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-06  4:18 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Thu, Sep 5, 2013 at 11:52 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> On Thu, Sep 5, 2013 at 12:39 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> Now the pack index v3 probably needs to be improved a little, again to
>> accommodate completion of thin packs.  Given that the main SHA1 table is
>> now in the main pack file, it should be possible to still carry a small
>> SHA1 table in the index file that corresponds to the appended objects
>> only. This means that a SHA1 search will have to first use the main SHA1
>> table in the pack file as it is done now, and if not found then use the
>> SHA1 table in the index file if it exists.  And of course
>> nth_packed_object_sha1() will have to be adjusted accordingly.
>
> What if the sender prepares the sha-1 table to contain missing objects
> in advance? The sender should know what base objects are missing. Then
> we only need to append objects at the receiving end and verify that
> all new objects are also present in the sha-1 table.

One minor detail to sort out: the size of sha-1 table. Previously it's
the number of objects in the pack. Now it's not true because the table
may have more entries. So how should we record the table size? We
could use null sha-1 as the end of table marker. Or we could make
pack-objects to write nr_objects as the number of entries _after_ pack
completion, not the true number of objects in thin pack. I like the
latter (no more rehashing the entire pack after completion) but then
we need a cue to know that we have reached the end of the pack..
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06  3:23       ` Nicolas Pitre
@ 2013-09-06  9:48         ` Duy Nguyen
  2013-09-06 13:25           ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-06  9:48 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Fri, Sep 6, 2013 at 10:23 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Fri, 6 Sep 2013, Nguy­n Thái Ng÷c Duy wrote:
>
>>
>> Signed-off-by: Nguy­n Thái Ng÷c Duy <pclouds@gmail.com>
>> ---
>>  Should be up to date with Nico's latest implementation and also cover
>>  additions to the format that everybody seems to agree on:
>>
>>   - new types for canonical trees and commits
>>   - sha-1 table covering missing objects in thin packs
>
> Great!  I've merged this into my branch with the following amendment:

I'd like to propose another change in the format to basically limit
the use of sha1ref format "\0<SHA-1>" to tree entries only. All forms
of deltas must have non-zero sha1 index (i.e. reference to SHA-1
table). It will simplify handling code, and I think it makes sense too

diff --git a/Documentation/technical/pack-format.txt
b/Documentation/technical/pack-format.txt
index d0c2cde..399416b 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -74,7 +74,8 @@ Git pack format

      Blobs and tags are deltified and compressed the same way in
      v3. Commits are not deltified. Trees are deltified using
-     special copy sequences.
+     special copy sequences. The base object name index in deltified
+     format must not be zero (i.e. it must point to SHA-1 table).

      Trees and commits in canonical types are in the same format as
      v2: in canonical format and deflated. They can be used for
@@ -166,6 +167,8 @@ Git pack format
     set, it will be followed by a base tree SHA-1. If it's not set,
     the last base tree will be used.

+    base_sha1_index cannot be zero and followed by full SHA-1.
+
 == Original (version 1) pack-*.idx files have the following format:

   - The header consists of 256 4-byte network byte order
-- 
Duy

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v2] Document pack v4 format
  2013-09-06  4:18                         ` Duy Nguyen
@ 2013-09-06 13:19                           ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-06 13:19 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

On Fri, 6 Sep 2013, Duy Nguyen wrote:

> On Thu, Sep 5, 2013 at 11:52 PM, Duy Nguyen <pclouds@gmail.com> wrote:
> > On Thu, Sep 5, 2013 at 12:39 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> >> Now the pack index v3 probably needs to be improved a little, again to
> >> accommodate completion of thin packs.  Given that the main SHA1 table is
> >> now in the main pack file, it should be possible to still carry a small
> >> SHA1 table in the index file that corresponds to the appended objects
> >> only. This means that a SHA1 search will have to first use the main SHA1
> >> table in the pack file as it is done now, and if not found then use the
> >> SHA1 table in the index file if it exists.  And of course
> >> nth_packed_object_sha1() will have to be adjusted accordingly.
> >
> > What if the sender prepares the sha-1 table to contain missing objects
> > in advance? The sender should know what base objects are missing. Then
> > we only need to append objects at the receiving end and verify that
> > all new objects are also present in the sha-1 table.
> 
> One minor detail to sort out: the size of sha-1 table. Previously it's
> the number of objects in the pack. Now it's not true because the table
> may have more entries. So how should we record the table size? We
> could use null sha-1 as the end of table marker. Or we could make
> pack-objects to write nr_objects as the number of entries _after_ pack
> completion, not the true number of objects in thin pack. I like the
> latter (no more rehashing the entire pack after completion) but then
> we need a cue to know that we have reached the end of the pack..

See the amendment I made to your documentation patch.  I opted for the 
later.  To mark the end of the transmitted objects a zero byte (object 
type 0) is used.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06  9:48         ` Duy Nguyen
@ 2013-09-06 13:25           ` Nicolas Pitre
  2013-09-06 13:44             ` Duy Nguyen
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-06 13:25 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1147 bytes --]

On Fri, 6 Sep 2013, Duy Nguyen wrote:

> On Fri, Sep 6, 2013 at 10:23 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > On Fri, 6 Sep 2013, Nguy­n Thái Ng÷c Duy wrote:
> >
> >>
> >> Signed-off-by: Nguy­n Thái Ng÷c Duy <pclouds@gmail.com>
> >> ---
> >>  Should be up to date with Nico's latest implementation and also cover
> >>  additions to the format that everybody seems to agree on:
> >>
> >>   - new types for canonical trees and commits
> >>   - sha-1 table covering missing objects in thin packs
> >
> > Great!  I've merged this into my branch with the following amendment:
> 
> I'd like to propose another change in the format to basically limit
> the use of sha1ref format "\0<SHA-1>" to tree entries only. All forms
> of deltas must have non-zero sha1 index (i.e. reference to SHA-1
> table). It will simplify handling code, and I think it makes sense too

Why?

This is still some artificial limitation and I tend not to like them.

"Simplifying handling code" is not a good enough reason on its own, 
especially if it reduce flexibility for possible future layout 
optimizations.

In what way is the code more difficult?


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06 13:25           ` Nicolas Pitre
@ 2013-09-06 13:44             ` Duy Nguyen
  2013-09-06 16:44               ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Duy Nguyen @ 2013-09-06 13:44 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Fri, Sep 6, 2013 at 8:25 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Fri, 6 Sep 2013, Duy Nguyen wrote:
>
>> On Fri, Sep 6, 2013 at 10:23 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
>> > On Fri, 6 Sep 2013, Nguy­n Thái Ng÷c Duy wrote:
>> >
>> >>
>> >> Signed-off-by: Nguy­n Thái Ng÷c Duy <pclouds@gmail.com>
>> >> ---
>> >>  Should be up to date with Nico's latest implementation and also cover
>> >>  additions to the format that everybody seems to agree on:
>> >>
>> >>   - new types for canonical trees and commits
>> >>   - sha-1 table covering missing objects in thin packs
>> >
>> > Great!  I've merged this into my branch with the following amendment:
>>
>> I'd like to propose another change in the format to basically limit
>> the use of sha1ref format "\0<SHA-1>" to tree entries only. All forms
>> of deltas must have non-zero sha1 index (i.e. reference to SHA-1
>> table). It will simplify handling code, and I think it makes sense too
>
> Why?
>
> This is still some artificial limitation and I tend not to like them.
>
> "Simplifying handling code" is not a good enough reason on its own,
> especially if it reduce flexibility for possible future layout
> optimizations.
>
> In what way is the code more difficult?

That'll be two ways of linking to another in-pack object. The linked
object must be in the pack anyway, its sha-1 should be present in the
sha-1 table. "\0<sha1>" is a longer encoding for nothing. If the
linked sha1 is not in the pack (similar to the old ref-delta), it
makes the pack depend on another one, which is what we've been
avoding. In your code you reject sha1ref starting with zero too
(sha1_file.c::get_base_delta and packv4-parse.c::decode_entries)
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06 13:44             ` Duy Nguyen
@ 2013-09-06 16:44               ` Nicolas Pitre
  2013-09-07  4:57                 ` Nicolas Pitre
  0 siblings, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-06 16:44 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2873 bytes --]

On Fri, 6 Sep 2013, Duy Nguyen wrote:

> On Fri, Sep 6, 2013 at 8:25 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
> > On Fri, 6 Sep 2013, Duy Nguyen wrote:
> >
> >> On Fri, Sep 6, 2013 at 10:23 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> >> > On Fri, 6 Sep 2013, Nguy­n Thái Ng÷c Duy wrote:
> >> >
> >> >>
> >> >> Signed-off-by: Nguy­n Thái Ng÷c Duy <pclouds@gmail.com>
> >> >> ---
> >> >>  Should be up to date with Nico's latest implementation and also cover
> >> >>  additions to the format that everybody seems to agree on:
> >> >>
> >> >>   - new types for canonical trees and commits
> >> >>   - sha-1 table covering missing objects in thin packs
> >> >
> >> > Great!  I've merged this into my branch with the following amendment:
> >>
> >> I'd like to propose another change in the format to basically limit
> >> the use of sha1ref format "\0<SHA-1>" to tree entries only. All forms
> >> of deltas must have non-zero sha1 index (i.e. reference to SHA-1
> >> table). It will simplify handling code, and I think it makes sense too
> >
> > Why?
> >
> > This is still some artificial limitation and I tend not to like them.
> >
> > "Simplifying handling code" is not a good enough reason on its own,
> > especially if it reduce flexibility for possible future layout
> > optimizations.
> >
> > In what way is the code more difficult?
> 
> That'll be two ways of linking to another in-pack object. The linked
> object must be in the pack anyway, its sha-1 should be present in the
> sha-1 table. "\0<sha1>" is a longer encoding for nothing. If the
> linked sha1 is not in the pack (similar to the old ref-delta), it
> makes the pack depend on another one, which is what we've been
> avoding.

OK.  Let's say that if the needed SHA1 exists in the table then the 
sha1ref should use an index into that table.  That's a recommendation we 
can make, just like we suggest to have the ident/path tables sorted by 
usage frequency.

But even if index 0 with a SHA1 is used inline for an object in the same 
pack, it is still theoretically a valid pack even if somewhat wasteful.  
Maybe some tools in the future will benefit from this flexibility to 
simplify their implementation.

You mention that only tree objects should use this encoding, but commits 
should be allowed to use it as well.  Suppose that a commit reverts some 
changes to a pre-existing tree state.  That tree might already be in 
another pack and we shouldn't have to copy it into the current pack.  
Same goes for commit parents.  Etc.

> In your code you reject sha1ref starting with zero too
> (sha1_file.c::get_base_delta and packv4-parse.c::decode_entries)

Yeah... because I wrote the minimum code to make it work with the 
current encoder.  But the decoder could be more lenient than that.  It's 
just a matter of adding a call to find_pack_entry_one() when the sha1ref 
index is 0.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06  2:14     ` [PATCH v3] " Nguyễn Thái Ngọc Duy
  2013-09-06  3:23       ` Nicolas Pitre
@ 2013-09-07  4:52       ` Nicolas Pitre
  2013-09-07  8:05         ` Duy Nguyen
  1 sibling, 1 reply; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-07  4:52 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

[-- Attachment #1: Type: TEXT/PLAIN, Size: 4070 bytes --]

On Fri, 6 Sep 2013, Nguyễn Thái Ngọc Duy wrote:

> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  Should be up to date with Nico's latest implementation and also cover
>  additions to the format that everybody seems to agree on:
> 
>   - new types for canonical trees and commits
>   - sha-1 table covering missing objects in thin packs

Yes, I've tweaked the format again.  I've implemented the code needed to 
carry canonical commit and tree objects in pack v4.  And things are much 
simpler if the canonical commit and tree types are identical to pack v2.  
Therefore the new types are used for the pack v4 optimized commit and 
tree representations.

I've therefore added this patch to my tree (with the needed changes to 
the documentation patch):

commit 98d4b75aff266015b5dff0a324a2984c2a8f7fa2
Author: Nicolas Pitre <nico@fluxnic.net>
Date:   Fri Sep 6 23:45:49 2013 -0400

    pack v4: allow canonical commit and tree objects
    
    If for some reason we can't transcode a commit or tree object (lossy
    encoding such as the zero-padded file mode for example) then we can still
    store the canonical object like in pack v2.
    
    This is also useful for completing a thin pack without having to add
    missing entries to the dictionary tables.
    
    To simplify things, the canonical commit and tree types retain their type
    number 1 and 2 respectively, and the transcoded types are now 9 and 10
    i.e. with bit 3 added.
    
    Signed-off-by: Nicolas Pitre <nico@fluxnic.net>

diff --git a/cache.h b/cache.h
index a6d016b..b68ad5a 100644
--- a/cache.h
+++ b/cache.h
@@ -330,6 +330,8 @@ enum object_type {
 	/* 5 for future expansion */
 	OBJ_OFS_DELTA = 6,
 	OBJ_REF_DELTA = 7,
+	OBJ_PV4_COMMIT = (8 + 1),
+	OBJ_PV4_TREE = (8 + 2),
 	OBJ_ANY,
 	OBJ_MAX
 };
diff --git a/packv4-create.c b/packv4-create.c
index 11cfe6f..38fa594 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -981,10 +981,15 @@ static off_t packv4_write_object(struct sha1file *f, struct packed_git *p,
 		die("unexpected object type %d", type);
 	}
 	free(src);
-	if (!result)
-		die("can't convert %s object %s",
-		    typename(type), sha1_to_hex(obj->sha1));
+	if (!result) {
+		warning("can't convert %s object %s",
+			typename(type), sha1_to_hex(obj->sha1));
+		/* fall back to copy the object in its original form */
+		return copy_object_data(f, p, obj->offset);
+	}
 
+	/* Use bit 3 to indicate a special type encoding */
+	type += 8;
 	hdrlen = write_object_header(f, type, obj_size);
 	sha1write(f, result, buf_size);
 	free(result);
diff --git a/packv4-parse.c b/packv4-parse.c
index 86ec535..63bba03 100644
--- a/packv4-parse.c
+++ b/packv4-parse.c
@@ -266,7 +266,7 @@ static int decode_entries(struct packed_git *p, struct pack_window **w_curs,
 			if (++scp - src >= avail - 20)
 				return -1;
 		/* let's still make sure this is actually a tree */
-		if ((*scp++ & 0xf) != OBJ_TREE)
+		if ((*scp++ & 0xf) != OBJ_PV4_TREE)
 			return -1;
 	}
 
diff --git a/sha1_file.c b/sha1_file.c
index 79e1293..c7bf677 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1806,6 +1806,11 @@ static enum object_type packed_to_object_type(struct packed_git *p,
 	}
 
 	switch (type) {
+	case OBJ_PV4_COMMIT:
+	case OBJ_PV4_TREE:
+		/* hide pack v4 special object types */
+		type -= 8;
+		break;
 	case OBJ_BAD:
 	case OBJ_COMMIT:
 	case OBJ_TREE:
@@ -2171,17 +2176,16 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 		if (data)
 			die("BUG in unpack_entry: left loop at a valid delta");
 		break;
+	case OBJ_PV4_COMMIT:
+		data = pv4_get_commit(p, &w_curs, curpos, size);
+		type -= 8;
+		break;
+	case OBJ_PV4_TREE:
+		data = pv4_get_tree(p, &w_curs, curpos, size);
+		type -= 8;
+		break;
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-		if (p->version >= 4 && !base_from_cache) {
-			if (type == OBJ_COMMIT) {
-				data = pv4_get_commit(p, &w_curs, curpos, size);
-			} else {
-				data = pv4_get_tree(p, &w_curs, curpos, size);
-			}
-			break;
-		}
-		/* fall through */
 	case OBJ_BLOB:
 	case OBJ_TAG:
 		if (!base_from_cache)


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-06 16:44               ` Nicolas Pitre
@ 2013-09-07  4:57                 ` Nicolas Pitre
  0 siblings, 0 replies; 83+ messages in thread
From: Nicolas Pitre @ 2013-09-07  4:57 UTC (permalink / raw)
  To: Duy Nguyen; +Cc: Git Mailing List, Junio C Hamano

On Fri, 6 Sep 2013, Nicolas Pitre wrote:

> On Fri, 6 Sep 2013, Duy Nguyen wrote:
> 
> > In your code you reject sha1ref starting with zero too
> > (sha1_file.c::get_base_delta and packv4-parse.c::decode_entries)
> 
> Yeah... because I wrote the minimum code to make it work with the 
> current encoder.  But the decoder could be more lenient than that.  It's 
> just a matter of adding a call to find_pack_entry_one() when the sha1ref 
> index is 0.

FYI I've added the missing code to packv4-parse.c::decode_entries.
The code in sha1_file.c::get_delta_base was already fine.


Nicolas

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3] Document pack v4 format
  2013-09-07  4:52       ` Nicolas Pitre
@ 2013-09-07  8:05         ` Duy Nguyen
  0 siblings, 0 replies; 83+ messages in thread
From: Duy Nguyen @ 2013-09-07  8:05 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Git Mailing List, Junio C Hamano

On Sat, Sep 7, 2013 at 11:52 AM, Nicolas Pitre <nico@fluxnic.net> wrote:
> On Fri, 6 Sep 2013, Nguyễn Thái Ngọc Duy wrote:
>
>>
>> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
>> ---
>>  Should be up to date with Nico's latest implementation and also cover
>>  additions to the format that everybody seems to agree on:
>>
>>   - new types for canonical trees and commits
>>   - sha-1 table covering missing objects in thin packs
>
> Yes, I've tweaked the format again.  I've implemented the code needed to
> carry canonical commit and tree objects in pack v4.  And things are much
> simpler if the canonical commit and tree types are identical to pack v2.
> Therefore the new types are used for the pack v4 optimized commit and
> tree representations.

Phew, index-pack finally produces .idx identical to packv4-create
output. Yeah using new types for v4 tree and commit should help. I
have a lot of "if (packv4 && type == OBJ_TREE)" tests, now they become
"if (type == OBJ_PV4_TREE)". Now rebasing on your tree.. Next step:
thin pack support in index-pack, then export/move code in
packv4-create to support v4 in pack-objects.
-- 
Duy

^ permalink raw reply	[flat|nested] 83+ messages in thread

end of thread, other threads:[~2013-09-07  8:06 UTC | newest]

Thread overview: 83+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-27  4:25 [PATCH 00/23] Preliminary pack v4 support Nicolas Pitre
2013-08-27  4:25 ` [PATCH 01/23] pack v4: initial pack dictionary structure and code Nicolas Pitre
2013-08-27 15:08   ` Junio C Hamano
2013-08-27 16:13     ` Nicolas Pitre
2013-08-27  4:25 ` [PATCH 02/23] export packed_object_info() Nicolas Pitre
2013-08-27  4:25 ` [PATCH 03/23] pack v4: scan tree objects Nicolas Pitre
2013-08-27  4:25 ` [PATCH 04/23] pack v4: add tree entry mode support to dictionary entries Nicolas Pitre
2013-08-27  4:25 ` [PATCH 05/23] pack v4: add commit object parsing Nicolas Pitre
2013-08-27 15:26   ` Junio C Hamano
2013-08-27 16:47     ` Nicolas Pitre
2013-08-27 17:42       ` Junio C Hamano
2013-08-27  4:25 ` [PATCH 06/23] pack v4: split the object list and dictionary creation Nicolas Pitre
2013-08-27  4:25 ` [PATCH 07/23] pack v4: move to struct pack_idx_entry and get rid of our own struct idx_entry Nicolas Pitre
2013-08-27  4:25 ` [PATCH 08/23] pack v4: basic references encoding Nicolas Pitre
2013-08-27 15:29   ` Junio C Hamano
2013-08-27 15:53     ` Nicolas Pitre
2013-08-27  4:25 ` [PATCH 09/23] pack v4: commit object encoding Nicolas Pitre
2013-08-27 15:39   ` Junio C Hamano
2013-08-27 16:50     ` Nicolas Pitre
2013-08-27 19:59     ` Nicolas Pitre
2013-08-27 20:15       ` Junio C Hamano
2013-08-27 21:43         ` Nicolas Pitre
2013-09-02 20:48   ` Duy Nguyen
2013-09-03  6:30     ` Nicolas Pitre
2013-09-03  7:41       ` Duy Nguyen
2013-09-05  3:50         ` Nicolas Pitre
2013-08-27  4:25 ` [PATCH 10/23] pack v4: tree " Nicolas Pitre
2013-08-27 15:44   ` Junio C Hamano
2013-08-27 16:52     ` Nicolas Pitre
2013-08-27  4:25 ` [PATCH 11/23] pack v4: dictionary table output Nicolas Pitre
2013-08-27  4:25 ` [PATCH 12/23] pack v4: creation code Nicolas Pitre
2013-08-27 15:48   ` Junio C Hamano
2013-08-27 16:59     ` Nicolas Pitre
2013-08-27  4:25 ` [PATCH 13/23] pack v4: object headers Nicolas Pitre
2013-08-27  4:25 ` [PATCH 14/23] pack v4: object data copy Nicolas Pitre
2013-08-27 15:53   ` Junio C Hamano
2013-08-27 18:24     ` Nicolas Pitre
2013-08-27  4:25 ` [PATCH 15/23] pack v4: object writing Nicolas Pitre
2013-08-27  4:26 ` [PATCH 16/23] pack v4: tree object delta encoding Nicolas Pitre
2013-08-27  4:26 ` [PATCH 17/23] pack v4: load delta candidate for encoding tree objects Nicolas Pitre
2013-08-27  4:26 ` [PATCH 18/23] pack v4: honor pack.compression config option Nicolas Pitre
2013-08-27  4:26 ` [PATCH 19/23] pack v4: relax commit parsing a bit Nicolas Pitre
2013-08-27  4:26 ` [PATCH 20/23] pack index v3 Nicolas Pitre
2013-08-27  4:26 ` [PATCH 21/23] pack v4: normalize pack name to properly generate the pack index file name Nicolas Pitre
2013-08-27  4:26 ` [PATCH 22/23] pack v4: add progress display Nicolas Pitre
2013-08-27  4:26 ` [PATCH 23/23] initial pack index v3 support on the read side Nicolas Pitre
2013-08-31 11:45   ` Duy Nguyen
2013-09-03  6:09     ` Nicolas Pitre
2013-09-03  7:34       ` Duy Nguyen
2013-08-27 11:17 ` [PATCH] Document pack v4 format Nguyễn Thái Ngọc Duy
2013-08-27 18:25   ` Junio C Hamano
2013-08-27 18:53   ` Nicolas Pitre
2013-08-31  2:49   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2013-09-03  6:00     ` Nicolas Pitre
2013-09-03  6:46       ` Nicolas Pitre
2013-09-03 11:49         ` Duy Nguyen
2013-09-03 14:54           ` Duy Nguyen
2013-09-05  4:12             ` Nicolas Pitre
2013-09-05  4:19               ` Duy Nguyen
2013-09-05  4:40                 ` Nicolas Pitre
2013-09-05  5:04                   ` Duy Nguyen
2013-09-05  5:39                     ` Nicolas Pitre
2013-09-05 16:52                       ` Duy Nguyen
2013-09-05 17:14                         ` Nicolas Pitre
2013-09-05 20:26                           ` Junio C Hamano
2013-09-05 21:04                             ` Nicolas Pitre
2013-09-06  4:18                         ` Duy Nguyen
2013-09-06 13:19                           ` Nicolas Pitre
2013-09-06  2:14     ` [PATCH v3] " Nguyễn Thái Ngọc Duy
2013-09-06  3:23       ` Nicolas Pitre
2013-09-06  9:48         ` Duy Nguyen
2013-09-06 13:25           ` Nicolas Pitre
2013-09-06 13:44             ` Duy Nguyen
2013-09-06 16:44               ` Nicolas Pitre
2013-09-07  4:57                 ` Nicolas Pitre
2013-09-07  4:52       ` Nicolas Pitre
2013-09-07  8:05         ` Duy Nguyen
2013-08-27 15:03 ` [PATCH 00/23] Preliminary pack v4 support Junio C Hamano
2013-08-27 15:59   ` Nicolas Pitre
2013-08-27 16:44     ` Junio C Hamano
2013-08-28  2:30       ` Duy Nguyen
2013-08-28  2:58         ` Nicolas Pitre
2013-08-28  3:06           ` Duy Nguyen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.