From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nicolas Pitre" <nico@fluxnic.net>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH] pack-objects: use streaming interface for reading large loose blobs
Date: Sat, 26 May 2012 17:28:01 +0700
Message-ID: <1338028081-22638-1-git-send-email-pclouds@gmail.com>

git usually streams large blobs directly to packs. But there are cases
where git itself creates large loose blobs (unpack-objects, or
hash-object fed from a pipe), and large loose blobs can also come from
other git implementations. core.bigFileThreshold can also be lowered,
introducing a new wave of large loose blobs.
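
For illustration (the file and pack names here are made up), both of
the following write a blob loose no matter how large it is:

    # a blob hashed from a pipe bypasses the streaming-to-pack path
    dd if=/dev/zero bs=1M count=600 | git hash-object -w --stdin

    # unpack-objects explodes pack contents into loose objects,
    # large blobs included
    git unpack-objects <some.pack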

Use the streaming interface to read, compress, and write these blobs
in one go. Fall back to the normal way if for some reason the
streaming interface cannot be used.
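
With a large loose blob in the repository, one way to exercise the new
path (the 1m threshold below is just an example value) is to repack
with a lowered threshold:

    # blobs larger than 1m now count as "large" and should go
    # through the streaming code path in pack-objects
    git -c core.bigfilethreshold=1m repack -ad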

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 On top of ng/pack-objects-cleanup. The change since the last version
 is that we no longer rely on close_istream(NULL).
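
 The new t1050 test below can be run on its own with the usual test
 harness invocation:

     cd t && ./t1050-large.sh -v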

 builtin/pack-objects.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-----
 t/t1050-large.sh       | 12 +++++++++
 2 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ccfcbad..f334820 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,46 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static unsigned long write_large_blob_data(struct git_istream *st, struct sha1file *f,
+					   const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	unsigned long olen = 0;
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	for (;;) {
+		ssize_t readlen;
+		int zret = Z_OK;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("unable to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+			olen += stream.next_out - obuf;
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	git_deflate_end(&stream);
+	return olen;
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -208,11 +249,18 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	unsigned hdrlen;
 	enum object_type type;
 	void *buf;
+	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		buf = read_sha1_file(entry->idx.sha1, &type, &size);
-		if (!buf)
-			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+		if (entry->type == OBJ_BLOB &&
+		    entry->size > big_file_threshold &&
+		    (st = open_istream(entry->idx.sha1, &type, &size, NULL)) != NULL)
+			buf = NULL;
+		else {
+			buf = read_sha1_file(entry->idx.sha1, &type, &size);
+			if (!buf)
+				die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+		}
 		/*
 		 * make sure no cached delta data remains from a
 		 * previous attempt before a pack split occurred.
@@ -233,7 +281,9 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
-	if (entry->z_delta_size)
+	if (st)	/* large blob case, just assume we don't compress well */
+		datalen = size;
+	else if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
 	else
 		datalen = do_compress(&buf, size);
@@ -256,6 +306,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		while (ofs >>= 7)
 			dheader[--pos] = 128 | (--ofs & 127);
 		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
@@ -268,6 +320,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		 * an additional 20 bytes for the base sha1.
 		 */
 		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
@@ -276,13 +330,20 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
 		sha1write(f, header, hdrlen);
 	}
-	sha1write(f, buf, datalen);
-	free(buf);
+	if (st) {
+		datalen = write_large_blob_data(st, f, entry->idx.sha1);
+		close_istream(st);
+	} else {
+		sha1write(f, buf, datalen);
+		free(buf);
+	}
 
 	return hdrlen + datalen;
 }
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..313889b 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,18 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	SHA1=`git hash-object huge` &&
+	test_create_repo loose &&
+	echo $SHA1 | git pack-objects --stdout |
+		GIT_ALLOC_LIMIT=0 GIT_DIR=loose/.git git unpack-objects &&
+	echo $SHA1 | GIT_DIR=loose/.git git pack-objects pack &&
+	test_create_repo packed &&
+	mv pack-* packed/.git/objects/pack &&
+	GIT_DIR=packed/.git git cat-file blob $SHA1 >actual &&
+	cmp huge actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.10.2.549.g9354186
