All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nicolas Pitre" <nico@fluxnic.net>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v2] pack-objects: use streaming interface for reading large loose blobs
Date: Sun, 13 May 2012 11:37:42 +0700	[thread overview]
Message-ID: <1336883862-9013-1-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <alpine.LFD.2.02.1205121220070.21030@xanadu.home>

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over a pipe). Or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new wave of
large loose blobs.

Use streaming interface to read/compress/write these blobs in one go.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 On Sat, May 12, 2012 at 11:51 PM, Nicolas Pitre <nico@fluxnic.net> wrote:
 >> @@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
 >>       if (!to_reuse) {
 >>               no_reuse:
 >>               if (!usable_delta) {
 >> -                     buf = read_sha1_file(entry->idx.sha1, &type, &size);
 >> -                     if (!buf)
 >> -                             die("unable to read %s", sha1_to_hex(entry->idx.sha1));
 >> +                     type = sha1_object_info(entry->idx.sha1, &size);
 >
 > Please don't use sha1_object_info() lightly.  This is a potentially
 > expensive operation, and you really don't want to do it on each object.
 >
 > And as a matter of fact, the information you are looking for has already
 > been determined earlier.  See the code in check_object() which tries
 > hard to avoid sha1_object_info() as much as possible.
 >
 > Therefore you should have entry->type and entry->size already set for
 > you to use.

 Thanks. sha1_object_info() removed in favor of entry->type and entry->size.

 builtin/pack-objects.c |   72 ++++++++++++++++++++++++++++++++++++++++++++----
 t/t1050-large.sh       |   16 ++++++++++
 2 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..ab5438a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	int zret;
+
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		die(_("unable to read %s"), sha1_to_hex(sha1));
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	if (type != OBJ_BLOB)
+		die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+	for (;;) {
+		ssize_t readlen;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("unable to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		zret = Z_OK;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	close_istream(st);
+	git_deflate_end(&stream);
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -259,9 +309,13 @@ static unsigned long write_object(struct sha1file *f,
 	if (!to_reuse) {
 		no_reuse:
 		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			if (entry->type == OBJ_BLOB && entry->size > big_file_threshold)
+				buf = NULL;
+			else {
+				buf = read_sha1_file(entry->idx.sha1, &type, &size);
+				if (!buf)
+					die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+			}
 			/*
 			 * make sure no cached delta data remains from a
 			 * previous attempt before a pack split occurred.
@@ -284,8 +338,11 @@ static unsigned long write_object(struct sha1file *f,
 
 		if (entry->z_delta_size)
 			datalen = entry->z_delta_size;
-		else
+		else if (buf)
 			datalen = do_compress(&buf, size);
+		else
+			/* large blob case, just assume we don't compress well */
+			datalen = size;
 
 		/*
 		 * The object header is a byte of 'type' followed by zero or
@@ -330,8 +387,11 @@ static unsigned long write_object(struct sha1file *f,
 			}
 			sha1write(f, header, hdrlen);
 		}
-		sha1write(f, buf, datalen);
-		free(buf);
+		if (buf) {
+			sha1write(f, buf, datalen);
+			free(buf);
+		} else
+			write_large_blob_data(f, entry->idx.sha1);
 	}
 	else {
 		struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	echo Z | dd of=large4 bs=1k seek=2000 &&
+	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+	(
+	unset GIT_ALLOC_LIMIT &&
+	cat large4 | git hash-object -w --stdin &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+	) &&
+	echo $OBJ | git pack-objects .git/objects/pack/pack &&
+	rm $P &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2

  reply	other threads:[~2012-05-13  4:41 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-12 10:26 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-12 16:51 ` Nicolas Pitre
2012-05-13  4:37   ` Nguyễn Thái Ngọc Duy [this message]
2012-05-14 15:56     ` [PATCH v2] " Junio C Hamano
2012-05-14 19:43     ` Junio C Hamano
2012-05-15 11:18       ` Nguyen Thai Ngoc Duy
2012-05-15 15:27         ` Junio C Hamano
2012-05-16  7:09           ` Nguyen Thai Ngoc Duy
2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
2012-05-16 12:02   ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
2012-05-18 21:05     ` Junio C Hamano
2012-05-16 12:02   ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
2012-05-18 21:16     ` Junio C Hamano
2012-05-19  2:43     ` Nicolas Pitre
2012-05-16 12:02   ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-18 21:02   ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1336883862-9013-1-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=nico@fluxnic.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.