* [PATCH] pack-objects: use streaming interface for reading large loose blobs
@ 2012-05-12 10:26 Nguyễn Thái Ngọc Duy
From: Nguyễn Thái Ngọc Duy @ 2012-05-12 10:26 UTC
  To: git; +Cc: Junio C Hamano, Nicolas Pitre, Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects, or hash-object
over a pipe), and they can also come from other git implementations.
Lowering core.bigfilethreshold can likewise introduce a new wave of
large loose blobs.
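
For example (illustrative, not part of the patch), hashing from a pipe
writes the blob as a loose object no matter how big it is:

    $ dd if=/dev/urandom bs=1M count=64 | git hash-object -w --stdin
    # a ~64MB blob now sits loose under .git/objects/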

Use the streaming interface to read these blobs and compress/write
them at the same time.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 index-pack's streaming support is on the way. unpack-objects is
 another story, because I'm thinking of merging it back into
 index-pack first, which may take more than one release cycle.
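
 The deflate loop added below follows the standard chunked zlib
 pattern. For reference, a minimal stand-alone sketch of the same loop
 against plain zlib (abbreviated error handling; the patch itself goes
 through git's git_deflate() wrappers) might look like:

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	/* Deflate 'in' to 'out' in fixed-size chunks: feed input with
	 * Z_NO_FLUSH and switch to Z_FINISH once a read returns no more
	 * data; Z_STREAM_END then marks a complete stream. */
	static int stream_deflate(FILE *in, FILE *out, int level)
	{
		z_stream strm;
		unsigned char ibuf[1024 * 16];
		unsigned char obuf[1024 * 16];
		size_t readlen;
		int zret;

		memset(&strm, 0, sizeof(strm));
		if (deflateInit(&strm, level) != Z_OK)
			return -1;
		do {
			readlen = fread(ibuf, 1, sizeof(ibuf), in);
			strm.next_in = ibuf;
			strm.avail_in = readlen;
			do {
				strm.next_out = obuf;
				strm.avail_out = sizeof(obuf);
				zret = deflate(&strm, readlen ? Z_NO_FLUSH : Z_FINISH);
				fwrite(obuf, 1, sizeof(obuf) - strm.avail_out, out);
			} while (strm.avail_out == 0);
		} while (readlen);	/* ends with zret == Z_STREAM_END */
		deflateEnd(&strm);
		return zret == Z_STREAM_END ? 0 : -1;
	}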

 builtin/pack-objects.c |   73 ++++++++++++++++++++++++++++++++++++++++++++----
 t/t1050-large.sh       |   16 ++++++++++
 2 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..98b51c1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	int zret;
+
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		die(_("failed to read %s"), sha1_to_hex(sha1));
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	if (type != OBJ_BLOB)
+		die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+	for (;;) {
+		ssize_t readlen;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("failed to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		zret = Z_OK;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	close_istream(st);
+	git_deflate_end(&stream);
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
 	if (!to_reuse) {
 		no_reuse:
 		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			type = sha1_object_info(entry->idx.sha1, &size);
+			if (type == OBJ_BLOB && size > big_file_threshold)
+				buf = NULL;
+			else {
+				buf = read_sha1_file(entry->idx.sha1, &type, &size);
+				if (!buf)
+					die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			}
 			/*
 			 * make sure no cached delta data remains from a
 			 * previous attempt before a pack split occurred.
@@ -284,8 +339,11 @@ static unsigned long write_object(struct sha1file *f,
 
 		if (entry->z_delta_size)
 			datalen = entry->z_delta_size;
-		else
+		else if (buf)
 			datalen = do_compress(&buf, size);
+		else
+			/* large blob case, just assume we don't compress well */
+			datalen = size;
 
 		/*
 		 * The object header is a byte of 'type' followed by zero or
@@ -330,8 +388,11 @@ static unsigned long write_object(struct sha1file *f,
 			}
 			sha1write(f, header, hdrlen);
 		}
-		sha1write(f, buf, datalen);
-		free(buf);
+		if (buf) {
+			sha1write(f, buf, datalen);
+			free(buf);
+		} else
+			write_large_blob_data(f, entry->idx.sha1);
 	}
 	else {
 		struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	echo Z | dd of=large4 bs=1k seek=2000 &&
+	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+	(
+	unset GIT_ALLOC_LIMIT &&
+	cat large4 | git hash-object -w --stdin &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+	) &&
+	echo $OBJ | git pack-objects .git/objects/pack/pack &&
+	rm $P &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2

* [PATCH] pack-objects: use streaming interface for reading large loose blobs
@ 2012-05-26 10:28 Nguyễn Thái Ngọc Duy
From: Nguyễn Thái Ngọc Duy @ 2012-05-26 10:28 UTC
  To: git; +Cc: Junio C Hamano, Nicolas Pitre, Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects, or hash-object
over a pipe), and they can also come from other git implementations.
Lowering core.bigfilethreshold can likewise introduce a new wave of
large loose blobs.

Use the streaming interface to read/compress/write these blobs in one
go. Fall back to the normal way if for some reason the streaming
interface cannot be used.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 On top of ng/pack-objects-cleanup. The change since the last version
 is that we no longer rely on close_istream(NULL).
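
 As a quick sanity check (not part of the patch), one can inspect the
 resulting pack to confirm the big blob went in whole, as a plain
 non-delta entry:

	$ git verify-pack -v .git/objects/pack/pack-*.idx
	# the large blob should be listed as a "blob" entry with no delta depth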

 builtin/pack-objects.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-----
 t/t1050-large.sh       | 12 +++++++++
 2 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ccfcbad..f334820 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,46 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static unsigned long write_large_blob_data(struct git_istream *st, struct sha1file *f,
+					   const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	unsigned long olen = 0;
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	for (;;) {
+		ssize_t readlen;
+		int zret = Z_OK;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("unable to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+			olen += stream.next_out - obuf;
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	git_deflate_end(&stream);
+	return olen;
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -208,11 +249,18 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	unsigned hdrlen;
 	enum object_type type;
 	void *buf;
+	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		buf = read_sha1_file(entry->idx.sha1, &type, &size);
-		if (!buf)
-			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+		if (entry->type == OBJ_BLOB &&
+		    entry->size > big_file_threshold &&
+		    (st = open_istream(entry->idx.sha1, &type, &size, NULL)) != NULL)
+			buf = NULL;
+		else {
+			buf = read_sha1_file(entry->idx.sha1, &type, &size);
+			if (!buf)
+				die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+		}
 		/*
 		 * make sure no cached delta data remains from a
 		 * previous attempt before a pack split occurred.
@@ -233,7 +281,9 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
-	if (entry->z_delta_size)
+	if (st)	/* large blob case, just assume we don't compress well */
+		datalen = size;
+	else if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
 	else
 		datalen = do_compress(&buf, size);
@@ -256,6 +306,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		while (ofs >>= 7)
 			dheader[--pos] = 128 | (--ofs & 127);
 		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
@@ -268,6 +320,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		 * an additional 20 bytes for the base sha1.
 		 */
 		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
@@ -276,13 +330,20 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
 		sha1write(f, header, hdrlen);
 	}
-	sha1write(f, buf, datalen);
-	free(buf);
+	if (st) {
+		datalen = write_large_blob_data(st, f, entry->idx.sha1);
+		close_istream(st);
+	} else {
+		sha1write(f, buf, datalen);
+		free(buf);
+	}
 
 	return hdrlen + datalen;
 }
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..313889b 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,18 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	SHA1=`git hash-object huge` &&
+	test_create_repo loose &&
+	echo $SHA1 | git pack-objects --stdout |
+		GIT_ALLOC_LIMIT=0 GIT_DIR=loose/.git git unpack-objects &&
+	echo $SHA1 | GIT_DIR=loose/.git git pack-objects pack &&
+	test_create_repo packed &&
+	mv pack-* packed/.git/objects/pack &&
+	GIT_DIR=packed/.git git cat-file blob $SHA1 >actual &&
+	cmp huge actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.10.2.549.g9354186


Thread overview: 18+ messages
2012-05-12 10:26 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-12 16:51 ` Nicolas Pitre
2012-05-13  4:37   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2012-05-14 15:56     ` Junio C Hamano
2012-05-14 19:43     ` Junio C Hamano
2012-05-15 11:18       ` Nguyen Thai Ngoc Duy
2012-05-15 15:27         ` Junio C Hamano
2012-05-16  7:09           ` Nguyen Thai Ngoc Duy
2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
2012-05-18 21:05     ` Junio C Hamano
2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
2012-05-18 21:16     ` Junio C Hamano
2012-05-19  2:43     ` Nicolas Pitre
2012-05-16 12:02   ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-18 21:02   ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano
2012-05-26 10:28 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-29 17:56 ` Junio C Hamano
