* [PATCH] pack-objects: use streaming interface for reading large loose blobs
@ 2012-05-26 10:28 Nguyễn Thái Ngọc Duy
  2012-05-29 17:56 ` Junio C Hamano
  0 siblings, 1 reply; 4+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-26 10:28 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre, Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over a pipe), or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new wave of
large loose blobs.

Use the streaming interface to read/compress/write these blobs in one
go. Fall back to the normal way if for some reason the streaming
interface cannot be used.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 On top of ng/pack-objects-cleanup. The change since the last version
 is that we no longer rely on close_istream(NULL).

 builtin/pack-objects.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-----
 t/t1050-large.sh       | 12 +++++++++
 2 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ccfcbad..f334820 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,46 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static unsigned long write_large_blob_data(struct git_istream *st, struct sha1file *f,
+					   const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	unsigned long olen = 0;
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	for (;;) {
+		ssize_t readlen;
+		int zret = Z_OK;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("unable to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+			olen += stream.next_out - obuf;
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	git_deflate_end(&stream);
+	return olen;
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -208,11 +249,18 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 	unsigned hdrlen;
 	enum object_type type;
 	void *buf;
+	struct git_istream *st = NULL;
 
 	if (!usable_delta) {
-		buf = read_sha1_file(entry->idx.sha1, &type, &size);
-		if (!buf)
-			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+		if (entry->type == OBJ_BLOB &&
+		    entry->size > big_file_threshold &&
+		    (st = open_istream(entry->idx.sha1, &type, &size, NULL)) != NULL)
+			buf = NULL;
+		else {
+			buf = read_sha1_file(entry->idx.sha1, &type, &size);
+			if (!buf)
+				die(_("unable to read %s"), sha1_to_hex(entry->idx.sha1));
+		}
 		/*
 		 * make sure no cached delta data remains from a
 		 * previous attempt before a pack split occurred.
@@ -233,7 +281,9 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 			OBJ_OFS_DELTA : OBJ_REF_DELTA;
 	}
 
-	if (entry->z_delta_size)
+	if (st)	/* large blob case, just assume we don't compress well */
+		datalen = size;
+	else if (entry->z_delta_size)
 		datalen = entry->z_delta_size;
 	else
 		datalen = do_compress(&buf, size);
@@ -256,6 +306,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		while (ofs >>= 7)
 			dheader[--pos] = 128 | (--ofs & 127);
 		if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
@@ -268,6 +320,8 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		 * an additional 20 bytes for the base sha1.
 		 */
 		if (limit && hdrlen + 20 + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
@@ -276,13 +330,20 @@ static unsigned long write_no_reuse_object(struct sha1file *f, struct object_ent
 		hdrlen += 20;
 	} else {
 		if (limit && hdrlen + datalen + 20 >= limit) {
+			if (st)
+				close_istream(st);
 			free(buf);
 			return 0;
 		}
 		sha1write(f, header, hdrlen);
 	}
-	sha1write(f, buf, datalen);
-	free(buf);
+	if (st) {
+		datalen = write_large_blob_data(st, f, entry->idx.sha1);
+		close_istream(st);
+	} else {
+		sha1write(f, buf, datalen);
+		free(buf);
+	}
 
 	return hdrlen + datalen;
 }
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..313889b 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,18 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	SHA1=`git hash-object huge` &&
+	test_create_repo loose &&
+	echo $SHA1 | git pack-objects --stdout |
+		GIT_ALLOC_LIMIT=0 GIT_DIR=loose/.git git unpack-objects &&
+	echo $SHA1 | GIT_DIR=loose/.git git pack-objects pack &&
+	test_create_repo packed &&
+	mv pack-* packed/.git/objects/pack &&
+	GIT_DIR=packed/.git git cat-file blob $SHA1 >actual &&
+	cmp huge actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.10.2.549.g9354186


* Re: [PATCH] pack-objects: use streaming interface for reading large loose blobs
  2012-05-26 10:28 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
@ 2012-05-29 17:56 ` Junio C Hamano
  0 siblings, 0 replies; 4+ messages in thread
From: Junio C Hamano @ 2012-05-29 17:56 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Nicolas Pitre

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> git usually streams large blobs directly to packs. But there are cases
> where git can create large loose blobs (unpack-objects or hash-object
> over a pipe), or they can come from other git implementations.
> core.bigfilethreshold can also be lowered, introducing a new wave of
> large loose blobs.
>
> Use the streaming interface to read/compress/write these blobs in one
> go. Fall back to the normal way if for some reason the streaming
> interface cannot be used.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  On top of ng/pack-objects-cleanup. The change since the last version
>  is that we no longer rely on close_istream(NULL).

This version looks much cleaner, at least to me, as the logic in the
function always switches on "st", using it as an "are we reading the
data from the stream incrementally?" boolean.
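
That is, the shape is roughly the following (a condensed paraphrase of
the patch above, not literal code):

	struct git_istream *st = NULL;	/* non-NULL: stream the blob */

	if (!usable_delta && entry->type == OBJ_BLOB &&
	    entry->size > big_file_threshold)
		st = open_istream(entry->idx.sha1, &type, &size, NULL);
	if (!st)
		buf = read_sha1_file(entry->idx.sha1, &type, &size);

	if (st)		/* large blob: assume it will not compress well */
		datalen = size;
	else
		datalen = do_compress(&buf, size);

	/* ... write the object header; close_istream(st) on early return ... */

	if (st) {
		datalen = write_large_blob_data(st, f, entry->idx.sha1);
		close_istream(st);
	} else {
		sha1write(f, buf, datalen);
		free(buf);
	}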

Thanks, will queue.


* Re: [PATCH] pack-objects: use streaming interface for reading large loose blobs
  2012-05-12 10:26 Nguyễn Thái Ngọc Duy
@ 2012-05-12 16:51 ` Nicolas Pitre
  0 siblings, 0 replies; 4+ messages in thread
From: Nicolas Pitre @ 2012-05-12 16:51 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

On Sat, 12 May 2012, Nguyễn Thái Ngọc Duy wrote:

> git usually streams large blobs directly to packs. But there are cases
> where git can create large loose blobs (unpack-objects or hash-object
> over a pipe), or they can come from other git implementations.
> core.bigfilethreshold can also be lowered, introducing a new wave of
> large loose blobs.
> 
> Use the streaming interface to read these blobs and compress/write
> them at the same time.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>

Comments below.

> ---
>  index-pack's streaming support is on the way. unpack-objects is
>  another story because I'm thinking of merging it back to index-pack
>  first, which may take more than one release cycle.
> 
>  builtin/pack-objects.c |   73 ++++++++++++++++++++++++++++++++++++++++++++----
>  t/t1050-large.sh       |   16 ++++++++++
>  2 files changed, 83 insertions(+), 6 deletions(-)
> 
> diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
> index 1861093..98b51c1 100644
> --- a/builtin/pack-objects.c
> +++ b/builtin/pack-objects.c
> @@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
>  	if (!to_reuse) {
>  		no_reuse:
>  		if (!usable_delta) {
> -			buf = read_sha1_file(entry->idx.sha1, &type, &size);
> -			if (!buf)
> -				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
> +			type = sha1_object_info(entry->idx.sha1, &size);

Please don't use sha1_object_info() lightly.  This is a potentially
expensive operation, and you really don't want to do it on each object.

And as a matter of fact, the information you are looking for has already 
been determined earlier.  See the code in check_object() which tries 
hard to avoid sha1_object_info() as much as possible.

Therefore you should have entry->type and entry->size already set for 
you to use.
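
In other words, the check could be written against those fields
directly, e.g. (untested sketch; the re-rolled patch at the top of
this thread restructures this part a bit further):

	if (entry->type == OBJ_BLOB && entry->size > big_file_threshold) {
		buf = NULL;		/* stream it out later instead */
		type = entry->type;
		size = entry->size;
	} else {
		buf = read_sha1_file(entry->idx.sha1, &type, &size);
		if (!buf)
			die("unable to read %s", sha1_to_hex(entry->idx.sha1));
	}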


Nicolas


* [PATCH] pack-objects: use streaming interface for reading large loose blobs
@ 2012-05-12 10:26 Nguyễn Thái Ngọc Duy
  2012-05-12 16:51 ` Nicolas Pitre
  0 siblings, 1 reply; 4+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-05-12 10:26 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nicolas Pitre, Nguyễn Thái Ngọc Duy

git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects or hash-object
over a pipe), or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new wave of
large loose blobs.

Use the streaming interface to read these blobs and compress/write
them at the same time.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 index-pack's streaming support is on the way. unpack-objects is
 another story because I'm thinking of merging it back to index-pack
 first, which may take more than one release cycle.

 builtin/pack-objects.c |   73 ++++++++++++++++++++++++++++++++++++++++++++----
 t/t1050-large.sh       |   16 ++++++++++
 2 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..98b51c1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
 #include "list-objects.h"
 #include "progress.h"
 #include "refs.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
 	return stream.total_out;
 }
 
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+	git_zstream stream;
+	unsigned char ibuf[1024 * 16];
+	unsigned char obuf[1024 * 16];
+	int zret;
+
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		die(_("failed to read %s"), sha1_to_hex(sha1));
+
+	memset(&stream, 0, sizeof(stream));
+	git_deflate_init(&stream, pack_compression_level);
+
+	if (type != OBJ_BLOB)
+		die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+	for (;;) {
+		ssize_t readlen;
+		readlen = read_istream(st, ibuf, sizeof(ibuf));
+		if (readlen == -1)
+			die(_("failed to read %s"), sha1_to_hex(sha1));
+
+		stream.next_in = ibuf;
+		stream.avail_in = readlen;
+		zret = Z_OK;
+		while ((stream.avail_in || readlen == 0) &&
+		       (zret == Z_OK || zret == Z_BUF_ERROR)) {
+			stream.next_out = obuf;
+			stream.avail_out = sizeof(obuf);
+			zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+			sha1write(f, obuf, stream.next_out - obuf);
+		}
+		if (stream.avail_in)
+			die(_("deflate error (%d)"), zret);
+		if (readlen == 0) {
+			if (zret != Z_STREAM_END)
+				die(_("deflate error (%d)"), zret);
+			break;
+		}
+	}
+	close_istream(st);
+	git_deflate_end(&stream);
+}
+
 /*
  * we are going to reuse the existing object data as is.  make
  * sure it is not corrupt.
@@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
 	if (!to_reuse) {
 		no_reuse:
 		if (!usable_delta) {
-			buf = read_sha1_file(entry->idx.sha1, &type, &size);
-			if (!buf)
-				die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			type = sha1_object_info(entry->idx.sha1, &size);
+			if (type == OBJ_BLOB && size > big_file_threshold)
+				buf = NULL;
+			else {
+				buf = read_sha1_file(entry->idx.sha1, &type, &size);
+				if (!buf)
+					die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+			}
 			/*
 			 * make sure no cached delta data remains from a
 			 * previous attempt before a pack split occurred.
@@ -284,8 +339,11 @@ static unsigned long write_object(struct sha1file *f,
 
 		if (entry->z_delta_size)
 			datalen = entry->z_delta_size;
-		else
+		else if (buf)
 			datalen = do_compress(&buf, size);
+		else
+			/* large blob case, just assume we don't compress well */
+			datalen = size;
 
 		/*
 		 * The object header is a byte of 'type' followed by zero or
@@ -330,8 +388,11 @@ static unsigned long write_object(struct sha1file *f,
 			}
 			sha1write(f, header, hdrlen);
 		}
-		sha1write(f, buf, datalen);
-		free(buf);
+		if (buf) {
+			sha1write(f, buf, datalen);
+			free(buf);
+		} else
+			write_large_blob_data(f, entry->idx.sha1);
 	}
 	else {
 		struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'pack-objects with large loose object' '
+	echo Z | dd of=large4 bs=1k seek=2000 &&
+	OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+	P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+	(
+	unset GIT_ALLOC_LIMIT &&
+	cat large4 | git hash-object -w --stdin &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+	) &&
+	echo $OBJ | git pack-objects .git/objects/pack/pack &&
+	rm $P &&
+	git cat-file blob $OBJ >actual &&
+	cmp large4 actual
+'
+
 test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
-- 
1.7.8.36.g69ee2

