All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] Large file support for git-archive
@ 2012-04-30  4:57 Nguyễn Thái Ngọc Duy
  2012-04-30  4:57 ` [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only Nguyễn Thái Ngọc Duy
                   ` (6 more replies)
  0 siblings, 7 replies; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-04-30  4:57 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, René Scharfe, Nguyễn Thái Ngọc Duy

This is a spin-off from the large file support series I posted some
time ago [1]. Both tar and zip format now support streaming large
blobs. Although zip will store uncompressed large blobs.

[1] http://thread.gmane.org/gmane.comp.version-control.git/191605

Nguyễn Thái Ngọc Duy (5):
  archive-tar: turn write_tar_entry into blob-writing only
  archive-tar: unindent write_tar_entry by one level
  archive: delegate blob reading to backend
  archive-tar: stream large blobs to tar file
  archive-zip: stream large blobs into zip file

 Documentation/git-archive.txt |    3 +
 archive-tar.c                 |  184 ++++++++++++++++++++++++++++-------------
 archive-zip.c                 |   56 ++++++++++++-
 archive.c                     |   28 +++----
 archive.h                     |   10 ++-
 t/t1050-large.sh              |    8 ++
 6 files changed, 209 insertions(+), 80 deletions(-)

-- 
1.7.8.36.g69ee2

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
@ 2012-04-30  4:57 ` Nguyễn Thái Ngọc Duy
  2012-04-30 18:15   ` Junio C Hamano
  2012-04-30  4:57 ` [PATCH 2/5] archive-tar: unindent write_tar_entry by one level Nguyễn Thái Ngọc Duy
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-04-30  4:57 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, René Scharfe, Nguyễn Thái Ngọc Duy

Before this patch write_tar_entry() can:

 - write global header
   by write_global_extended_header() calling write_tar_entry with
   with both sha1 and path == NULL

 - write extended header for symlinks, by write_tar_entry() calling
   itself with sha1 != NULL and path == NULL

 - write a normal blob. In this case both sha1 and path are valid.

After this patch, the first two call sites are modified to write the
header without calling write_tar_entry(). The function is now for
writing blobs only. This simplifies handling when write_tar_entry()
learns about large blobs.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 archive-tar.c |   78 ++++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/archive-tar.c b/archive-tar.c
index 20af005..1727ab9 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -123,6 +123,43 @@ static size_t get_path_prefix(const char *path, size_t pathlen, size_t maxlen)
 	return i;
 }
 
+static void prepare_header(struct archiver_args *args,
+			   struct ustar_header *header,
+			   unsigned int mode, unsigned long size)
+{
+	sprintf(header->mode, "%07o", mode & 07777);
+	sprintf(header->size, "%011lo", S_ISREG(mode) ? size : 0);
+	sprintf(header->mtime, "%011lo", (unsigned long) args->time);
+
+	sprintf(header->uid, "%07o", 0);
+	sprintf(header->gid, "%07o", 0);
+	strlcpy(header->uname, "root", sizeof(header->uname));
+	strlcpy(header->gname, "root", sizeof(header->gname));
+	sprintf(header->devmajor, "%07o", 0);
+	sprintf(header->devminor, "%07o", 0);
+
+	memcpy(header->magic, "ustar", 6);
+	memcpy(header->version, "00", 2);
+
+	sprintf(header->chksum, "%07o", ustar_header_chksum(header));
+}
+
+static int write_extended_header(struct archiver_args *args,
+				 const unsigned char *sha1,
+				 const void *buffer, unsigned long size)
+{
+	struct ustar_header header;
+	unsigned int mode;
+	memset(&header, 0, sizeof(header));
+	*header.typeflag = TYPEFLAG_EXT_HEADER;
+	mode = 0100666;
+	sprintf(header.name, "%s.paxheader", sha1_to_hex(sha1));
+	prepare_header(args, &header, mode, size);
+	write_blocked(&header, sizeof(header));
+	write_blocked(buffer, size);
+	return 0;
+}
+
 static int write_tar_entry(struct archiver_args *args,
 		const unsigned char *sha1, const char *path, size_t pathlen,
 		unsigned int mode, void *buffer, unsigned long size)
@@ -134,13 +171,9 @@ static int write_tar_entry(struct archiver_args *args,
 	memset(&header, 0, sizeof(header));
 
 	if (!sha1) {
-		*header.typeflag = TYPEFLAG_GLOBAL_HEADER;
-		mode = 0100666;
-		strcpy(header.name, "pax_global_header");
+		die("BUG: sha1 == NULL is not supported");
 	} else if (!path) {
-		*header.typeflag = TYPEFLAG_EXT_HEADER;
-		mode = 0100666;
-		sprintf(header.name, "%s.paxheader", sha1_to_hex(sha1));
+		die("BUG: path == NULL is not supported");
 	} else {
 		if (S_ISDIR(mode) || S_ISGITLINK(mode)) {
 			*header.typeflag = TYPEFLAG_DIR;
@@ -182,25 +215,11 @@ static int write_tar_entry(struct archiver_args *args,
 			memcpy(header.linkname, buffer, size);
 	}
 
-	sprintf(header.mode, "%07o", mode & 07777);
-	sprintf(header.size, "%011lo", S_ISREG(mode) ? size : 0);
-	sprintf(header.mtime, "%011lo", (unsigned long) args->time);
-
-	sprintf(header.uid, "%07o", 0);
-	sprintf(header.gid, "%07o", 0);
-	strlcpy(header.uname, "root", sizeof(header.uname));
-	strlcpy(header.gname, "root", sizeof(header.gname));
-	sprintf(header.devmajor, "%07o", 0);
-	sprintf(header.devminor, "%07o", 0);
-
-	memcpy(header.magic, "ustar", 6);
-	memcpy(header.version, "00", 2);
-
-	sprintf(header.chksum, "%07o", ustar_header_chksum(&header));
+	prepare_header(args, &header, mode, size);
 
 	if (ext_header.len > 0) {
-		err = write_tar_entry(args, sha1, NULL, 0, 0, ext_header.buf,
-				ext_header.len);
+		err = write_extended_header(args, sha1, ext_header.buf,
+					    ext_header.len);
 		if (err)
 			return err;
 	}
@@ -215,11 +234,18 @@ static int write_global_extended_header(struct archiver_args *args)
 {
 	const unsigned char *sha1 = args->commit_sha1;
 	struct strbuf ext_header = STRBUF_INIT;
-	int err;
+	struct ustar_header header;
+	unsigned int mode;
+	int err = 0;
 
 	strbuf_append_ext_header(&ext_header, "comment", sha1_to_hex(sha1), 40);
-	err = write_tar_entry(args, NULL, NULL, 0, 0, ext_header.buf,
-			ext_header.len);
+	memset(&header, 0, sizeof(header));
+	*header.typeflag = TYPEFLAG_GLOBAL_HEADER;
+	mode = 0100666;
+	strcpy(header.name, "pax_global_header");
+	prepare_header(args, &header, mode, ext_header.len);
+	write_blocked(&header, sizeof(header));
+	write_blocked(ext_header.buf, ext_header.len);
 	strbuf_release(&ext_header);
 	return err;
 }
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 2/5] archive-tar: unindent write_tar_entry by one level
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
  2012-04-30  4:57 ` [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only Nguyễn Thái Ngọc Duy
@ 2012-04-30  4:57 ` Nguyễn Thái Ngọc Duy
  2012-04-30  4:57 ` [PATCH 3/5] archive: delegate blob reading to backend Nguyễn Thái Ngọc Duy
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-04-30  4:57 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, René Scharfe, Nguyễn Thái Ngọc Duy

It's used to be

if (!sha1) {
  ...
} else if (!path) {
  ...
} else {
  ...
}

Now that the first two blocks are no-op. We can remove the if/else
skeleton and put the else block back by one indent level.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 archive-tar.c |   56 +++++++++++++++++++++++++-------------------------------
 1 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/archive-tar.c b/archive-tar.c
index 1727ab9..6c8a0bd 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -170,40 +170,34 @@ static int write_tar_entry(struct archiver_args *args,
 
 	memset(&header, 0, sizeof(header));
 
-	if (!sha1) {
-		die("BUG: sha1 == NULL is not supported");
-	} else if (!path) {
-		die("BUG: path == NULL is not supported");
+	if (S_ISDIR(mode) || S_ISGITLINK(mode)) {
+		*header.typeflag = TYPEFLAG_DIR;
+		mode = (mode | 0777) & ~tar_umask;
+	} else if (S_ISLNK(mode)) {
+		*header.typeflag = TYPEFLAG_LNK;
+		mode |= 0777;
+	} else if (S_ISREG(mode)) {
+		*header.typeflag = TYPEFLAG_REG;
+		mode = (mode | ((mode & 0100) ? 0777 : 0666)) & ~tar_umask;
 	} else {
-		if (S_ISDIR(mode) || S_ISGITLINK(mode)) {
-			*header.typeflag = TYPEFLAG_DIR;
-			mode = (mode | 0777) & ~tar_umask;
-		} else if (S_ISLNK(mode)) {
-			*header.typeflag = TYPEFLAG_LNK;
-			mode |= 0777;
-		} else if (S_ISREG(mode)) {
-			*header.typeflag = TYPEFLAG_REG;
-			mode = (mode | ((mode & 0100) ? 0777 : 0666)) & ~tar_umask;
+		return error("unsupported file mode: 0%o (SHA1: %s)",
+			     mode, sha1_to_hex(sha1));
+	}
+	if (pathlen > sizeof(header.name)) {
+		size_t plen = get_path_prefix(path, pathlen,
+					      sizeof(header.prefix));
+		size_t rest = pathlen - plen - 1;
+		if (plen > 0 && rest <= sizeof(header.name)) {
+			memcpy(header.prefix, path, plen);
+				memcpy(header.name, path + plen + 1, rest);
 		} else {
-			return error("unsupported file mode: 0%o (SHA1: %s)",
-					mode, sha1_to_hex(sha1));
+			sprintf(header.name, "%s.data",
+				sha1_to_hex(sha1));
+			strbuf_append_ext_header(&ext_header, "path",
+						 path, pathlen);
 		}
-		if (pathlen > sizeof(header.name)) {
-			size_t plen = get_path_prefix(path, pathlen,
-					sizeof(header.prefix));
-			size_t rest = pathlen - plen - 1;
-			if (plen > 0 && rest <= sizeof(header.name)) {
-				memcpy(header.prefix, path, plen);
-				memcpy(header.name, path + plen + 1, rest);
-			} else {
-				sprintf(header.name, "%s.data",
-				        sha1_to_hex(sha1));
-				strbuf_append_ext_header(&ext_header, "path",
-						path, pathlen);
-			}
-		} else
-			memcpy(header.name, path, pathlen);
-	}
+	} else
+		memcpy(header.name, path, pathlen);
 
 	if (S_ISLNK(mode) && buffer) {
 		if (size > sizeof(header.linkname)) {
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 3/5] archive: delegate blob reading to backend
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
  2012-04-30  4:57 ` [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only Nguyễn Thái Ngọc Duy
  2012-04-30  4:57 ` [PATCH 2/5] archive-tar: unindent write_tar_entry by one level Nguyễn Thái Ngọc Duy
@ 2012-04-30  4:57 ` Nguyễn Thái Ngọc Duy
  2012-04-30 21:07   ` René Scharfe
  2012-04-30  4:57 ` [PATCH 4/5] archive-tar: stream large blobs to tar file Nguyễn Thái Ngọc Duy
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-04-30  4:57 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, René Scharfe, Nguyễn Thái Ngọc Duy

archive-tar.c and archive-zip.c now perform conversion check, with
help of sha1_file_to_archive() from archive.c

This gives backends more freedom in dealing with (streaming) large
blobs.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 archive-tar.c |   20 +++++++++++++++++---
 archive-zip.c |   14 ++++++++++++--
 archive.c     |   28 +++++++++++-----------------
 archive.h     |   10 +++++++++-
 4 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/archive-tar.c b/archive-tar.c
index 6c8a0bd..61821f4 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -161,11 +161,15 @@ static int write_extended_header(struct archiver_args *args,
 }
 
 static int write_tar_entry(struct archiver_args *args,
-		const unsigned char *sha1, const char *path, size_t pathlen,
-		unsigned int mode, void *buffer, unsigned long size)
+			   const unsigned char *sha1,
+			   const char *path, size_t pathlen,
+			   unsigned int mode)
 {
 	struct ustar_header header;
 	struct strbuf ext_header = STRBUF_INIT;
+	unsigned int old_mode = mode;
+	unsigned long size;
+	void *buffer;
 	int err = 0;
 
 	memset(&header, 0, sizeof(header));
@@ -199,7 +203,17 @@ static int write_tar_entry(struct archiver_args *args,
 	} else
 		memcpy(header.name, path, pathlen);
 
-	if (S_ISLNK(mode) && buffer) {
+	if (S_ISLNK(mode) || S_ISREG(mode)) {
+		enum object_type type;
+		buffer = sha1_file_to_archive(args, path, sha1, old_mode, &type, &size);
+		if (!buffer)
+			return error("cannot read %s", sha1_to_hex(sha1));
+	} else {
+		buffer = NULL;
+		size = 0;
+	}
+
+	if (S_ISLNK(mode)) {
 		if (size > sizeof(header.linkname)) {
 			sprintf(header.linkname, "see %s.paxheader",
 			        sha1_to_hex(sha1));
diff --git a/archive-zip.c b/archive-zip.c
index 02d1f37..f8039ba 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -121,8 +121,9 @@ static void *zlib_deflate(void *data, unsigned long size,
 }
 
 static int write_zip_entry(struct archiver_args *args,
-		const unsigned char *sha1, const char *path, size_t pathlen,
-		unsigned int mode, void *buffer, unsigned long size)
+			   const unsigned char *sha1,
+			   const char *path, size_t pathlen,
+			   unsigned int mode)
 {
 	struct zip_local_header header;
 	struct zip_dir_header dirent;
@@ -134,6 +135,8 @@ static int write_zip_entry(struct archiver_args *args,
 	int method;
 	unsigned char *out;
 	void *deflated = NULL;
+	void *buffer;
+	unsigned long size;
 
 	crc = crc32(0, NULL, 0);
 
@@ -148,7 +151,14 @@ static int write_zip_entry(struct archiver_args *args,
 		out = NULL;
 		uncompressed_size = 0;
 		compressed_size = 0;
+		buffer = NULL;
+		size = 0;
 	} else if (S_ISREG(mode) || S_ISLNK(mode)) {
+		enum object_type type;
+		buffer = sha1_file_to_archive(args, path, sha1, mode, &type, &size);
+		if (!buffer)
+			return error("cannot read %s", sha1_to_hex(sha1));
+
 		method = 0;
 		attr2 = S_ISLNK(mode) ? ((mode | 0777) << 16) :
 			(mode & 0111) ? ((mode) << 16) : 0;
diff --git a/archive.c b/archive.c
index 1ee837d..cd083ea 100644
--- a/archive.c
+++ b/archive.c
@@ -59,12 +59,15 @@ static void format_subst(const struct commit *commit,
 	free(to_free);
 }
 
-static void *sha1_file_to_archive(const char *path, const unsigned char *sha1,
-		unsigned int mode, enum object_type *type,
-		unsigned long *sizep, const struct commit *commit)
+void *sha1_file_to_archive(const struct archiver_args *args,
+			   const char *path, const unsigned char *sha1,
+			   unsigned int mode, enum object_type *type,
+			   unsigned long *sizep)
 {
 	void *buffer;
+	const struct commit *commit = args->convert ? args->commit : NULL;
 
+	path += args->baselen;
 	buffer = read_sha1_file(sha1, type, sizep);
 	if (buffer && S_ISREG(mode)) {
 		struct strbuf buf = STRBUF_INIT;
@@ -109,12 +112,9 @@ static int write_archive_entry(const unsigned char *sha1, const char *base,
 	write_archive_entry_fn_t write_entry = c->write_entry;
 	struct git_attr_check check[2];
 	const char *path_without_prefix;
-	int convert = 0;
 	int err;
-	enum object_type type;
-	unsigned long size;
-	void *buffer;
 
+	args->convert = 0;
 	strbuf_reset(&path);
 	strbuf_grow(&path, PATH_MAX);
 	strbuf_add(&path, args->base, args->baselen);
@@ -126,28 +126,22 @@ static int write_archive_entry(const unsigned char *sha1, const char *base,
 	if (!git_check_attr(path_without_prefix, ARRAY_SIZE(check), check)) {
 		if (ATTR_TRUE(check[0].value))
 			return 0;
-		convert = ATTR_TRUE(check[1].value);
+		args->convert = ATTR_TRUE(check[1].value);
 	}
 
 	if (S_ISDIR(mode) || S_ISGITLINK(mode)) {
 		strbuf_addch(&path, '/');
 		if (args->verbose)
 			fprintf(stderr, "%.*s\n", (int)path.len, path.buf);
-		err = write_entry(args, sha1, path.buf, path.len, mode, NULL, 0);
+		err = write_entry(args, sha1, path.buf, path.len, mode);
 		if (err)
 			return err;
 		return (S_ISDIR(mode) ? READ_TREE_RECURSIVE : 0);
 	}
 
-	buffer = sha1_file_to_archive(path_without_prefix, sha1, mode,
-			&type, &size, convert ? args->commit : NULL);
-	if (!buffer)
-		return error("cannot read %s", sha1_to_hex(sha1));
 	if (args->verbose)
 		fprintf(stderr, "%.*s\n", (int)path.len, path.buf);
-	err = write_entry(args, sha1, path.buf, path.len, mode, buffer, size);
-	free(buffer);
-	return err;
+	return write_entry(args, sha1, path.buf, path.len, mode);
 }
 
 int write_archive_entries(struct archiver_args *args,
@@ -167,7 +161,7 @@ int write_archive_entries(struct archiver_args *args,
 		if (args->verbose)
 			fprintf(stderr, "%.*s\n", (int)len, args->base);
 		err = write_entry(args, args->tree->object.sha1, args->base,
-				len, 040777, NULL, 0);
+				  len, 040777);
 		if (err)
 			return err;
 	}
diff --git a/archive.h b/archive.h
index 2b0884f..895afcd 100644
--- a/archive.h
+++ b/archive.h
@@ -11,6 +11,7 @@ struct archiver_args {
 	const char **pathspec;
 	unsigned int verbose : 1;
 	unsigned int worktree_attributes : 1;
+	unsigned int convert : 1;
 	int compression_level;
 };
 
@@ -27,11 +28,18 @@ extern void register_archiver(struct archiver *);
 extern void init_tar_archiver(void);
 extern void init_zip_archiver(void);
 
-typedef int (*write_archive_entry_fn_t)(struct archiver_args *args, const unsigned char *sha1, const char *path, size_t pathlen, unsigned int mode, void *buffer, unsigned long size);
+typedef int (*write_archive_entry_fn_t)(struct archiver_args *args,
+					const unsigned char *sha1,
+					const char *path, size_t pathlen,
+					unsigned int mode);
 
 extern int write_archive_entries(struct archiver_args *args, write_archive_entry_fn_t write_entry);
 extern int write_archive(int argc, const char **argv, const char *prefix, int setup_prefix, const char *name_hint, int remote);
 
 const char *archive_format_from_filename(const char *filename);
+extern void *sha1_file_to_archive(const struct archiver_args *args,
+				  const char *path, const unsigned char *sha1,
+				  unsigned int mode, enum object_type *type,
+				  unsigned long *sizep);
 
 #endif	/* ARCHIVE_H */
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 4/5] archive-tar: stream large blobs to tar file
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
                   ` (2 preceding siblings ...)
  2012-04-30  4:57 ` [PATCH 3/5] archive: delegate blob reading to backend Nguyễn Thái Ngọc Duy
@ 2012-04-30  4:57 ` Nguyễn Thái Ngọc Duy
  2012-04-30 19:01   ` Junio C Hamano
  2012-04-30 21:08   ` René Scharfe
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-04-30  4:57 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, René Scharfe, Nguyễn Thái Ngọc Duy


Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 archive-tar.c    |   38 +++++++++++++++++++++++++++++++++++---
 t/t1050-large.sh |    4 ++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/archive-tar.c b/archive-tar.c
index 61821f4..865ef6d 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -4,6 +4,7 @@
 #include "cache.h"
 #include "tar.h"
 #include "archive.h"
+#include "streaming.h"
 #include "run-command.h"
 
 #define RECORDSIZE	(512)
@@ -62,6 +63,29 @@ static void write_blocked(const void *data, unsigned long size)
 	write_if_needed();
 }
 
+static int stream_blob_to_file(const unsigned char *sha1)
+{
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		return error("cannot stream blob %s", sha1_to_hex(sha1));
+	for (;;) {
+		char buf[BLOCKSIZE];
+		ssize_t readlen;
+
+		readlen = read_istream(st, buf, sizeof(buf));
+
+		if (readlen <= 0)
+			return readlen;
+		write_blocked(buf, readlen);
+	}
+	close_istream(st);
+	return 0;
+}
+
 /*
  * The end of tar archives is marked by 2*512 nul bytes and after that
  * follows the rest of the block (if any).
@@ -203,7 +227,11 @@ static int write_tar_entry(struct archiver_args *args,
 	} else
 		memcpy(header.name, path, pathlen);
 
-	if (S_ISLNK(mode) || S_ISREG(mode)) {
+	if (S_ISREG(mode) && !args->convert &&
+	    sha1_object_info(sha1, &size) == OBJ_BLOB &&
+	    size > big_file_threshold)
+		buffer = NULL;
+	else if (S_ISLNK(mode) || S_ISREG(mode)) {
 		enum object_type type;
 		buffer = sha1_file_to_archive(args, path, sha1, old_mode, &type, &size);
 		if (!buffer)
@@ -233,8 +261,12 @@ static int write_tar_entry(struct archiver_args *args,
 	}
 	strbuf_release(&ext_header);
 	write_blocked(&header, sizeof(header));
-	if (S_ISREG(mode) && buffer && size > 0)
-		write_blocked(buffer, size);
+	if (S_ISREG(mode) && size > 0) {
+		if (buffer)
+			write_blocked(buffer, size);
+		else
+			err = stream_blob_to_file(sha1);
+	}
 	return err;
 }
 
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 4d127f1..fe47554 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,4 +134,8 @@ test_expect_success 'repack' '
 	git repack -ad
 '
 
+test_expect_success 'tar achiving' '
+	git archive --format=tar HEAD >/dev/null
+'
+
 test_done
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 5/5] archive-zip: stream large blobs into zip file
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
                   ` (3 preceding siblings ...)
  2012-04-30  4:57 ` [PATCH 4/5] archive-tar: stream large blobs to tar file Nguyễn Thái Ngọc Duy
@ 2012-04-30  4:57 ` Nguyễn Thái Ngọc Duy
  2012-04-30 19:12   ` Junio C Hamano
                     ` (5 more replies)
  2012-04-30 19:15 ` [PATCH 0/5] Large file support for git-archive Junio C Hamano
  2012-04-30 21:07 ` René Scharfe
  6 siblings, 6 replies; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-04-30  4:57 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, René Scharfe, Nguyễn Thái Ngọc Duy

A large blob will be read twice. One for calculating crc32, one for
actual writing. Large blobs are written uncompressed for simplicity.

Writing compressed large blobs is possible. But a naive implementation
would need to decompress/compress the blob twice: one to calculate
compressed size, one for actual writing, assuming compressed blobs are
still over large file limit.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 I think we could extract compressed size from pack index, then stream
 the compressed blob directly from pack to zip file. But that makes
 git-archive sensitive to pack format. And to be honest I don't care
 that much about large file support to do it. This patch is good
 enough for me.

 Documentation/git-archive.txt |    3 ++
 archive-zip.c                 |   42 ++++++++++++++++++++++++++++++++++++++++-
 t/t1050-large.sh              |    4 +++
 3 files changed, 48 insertions(+), 1 deletions(-)

diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt
index ac7006e..6df85a6 100644
--- a/Documentation/git-archive.txt
+++ b/Documentation/git-archive.txt
@@ -120,6 +120,9 @@ tar.<format>.remote::
 	user-defined formats, but true for the "tar.gz" and "tgz"
 	formats.
 
+core.bigFileThreshold::
+	Files larger than this size are stored uncompressed in zip format.
+
 ATTRIBUTES
 ----------
 
diff --git a/archive-zip.c b/archive-zip.c
index f8039ba..ee58bda 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -3,6 +3,7 @@
  */
 #include "cache.h"
 #include "archive.h"
+#include "streaming.h"
 
 static int zip_date;
 static int zip_time;
@@ -120,6 +121,29 @@ static void *zlib_deflate(void *data, unsigned long size,
 	return buffer;
 }
 
+static int crc32_stream(const unsigned char *sha1, unsigned long *crc)
+{
+	struct git_istream *st;
+	enum object_type type;
+	unsigned long sz;
+
+	st = open_istream(sha1, &type, &sz, NULL);
+	if (!st)
+		return error("cannot stream blob %s", sha1_to_hex(sha1));
+	for (;;) {
+		char buf[1024];
+		ssize_t readlen;
+
+		readlen = read_istream(st, buf, sizeof(buf));
+
+		if (readlen <= 0)
+			return readlen;
+		*crc = crc32(*crc, (unsigned char*)buf, readlen);
+	}
+	close_istream(st);
+	return 0;
+}
+
 static int write_zip_entry(struct archiver_args *args,
 			   const unsigned char *sha1,
 			   const char *path, size_t pathlen,
@@ -153,6 +177,19 @@ static int write_zip_entry(struct archiver_args *args,
 		compressed_size = 0;
 		buffer = NULL;
 		size = 0;
+	} else if (!args->convert && S_ISREG(mode) &&
+		      sha1_object_info(sha1, &size) == OBJ_BLOB &&
+		      size > big_file_threshold) {
+		buffer = NULL;
+		method = 0;
+		attr2 = S_ISLNK(mode) ? ((mode | 0777) << 16) :
+			(mode & 0111) ? ((mode) << 16) : 0;
+		if (crc32_stream(sha1, &crc) < 0)
+			return error("failed to calculate crc32 from blob %s, SHA1 %s",
+				     path, sha1_to_hex(sha1));
+		out = buffer;
+		uncompressed_size = size;
+		compressed_size = size;
 	} else if (S_ISREG(mode) || S_ISLNK(mode)) {
 		enum object_type type;
 		buffer = sha1_file_to_archive(args, path, sha1, mode, &type, &size);
@@ -234,7 +271,10 @@ static int write_zip_entry(struct archiver_args *args,
 	write_or_die(1, path, pathlen);
 	zip_offset += pathlen;
 	if (compressed_size > 0) {
-		write_or_die(1, out, compressed_size);
+		if (out)
+			write_or_die(1, out, compressed_size);
+		else
+			stream_blob_to_fd(1, sha1, NULL, 0);
 		zip_offset += compressed_size;
 	}
 
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index fe47554..458fdde 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -138,4 +138,8 @@ test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
 
+test_expect_success 'zip achiving' '
+	git archive --format=zip HEAD >/dev/null
+'
+
 test_done
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only
  2012-04-30  4:57 ` [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only Nguyễn Thái Ngọc Duy
@ 2012-04-30 18:15   ` Junio C Hamano
  2012-04-30 22:11     ` René Scharfe
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-04-30 18:15 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, René Scharfe

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Before this patch write_tar_entry() can:
>
>  - write global header
>    by write_global_extended_header() calling write_tar_entry with
>    with both sha1 and path == NULL
>
>  - write extended header for symlinks, by write_tar_entry() calling
>    itself with sha1 != NULL and path == NULL
>
>  - write a normal blob. In this case both sha1 and path are valid.
>
> After this patch, the first two call sites are modified to write the
> header without calling write_tar_entry(). The function is now for
> writing blobs only.

Nice.

I am kind of surprised how hacky the original code that switched on !sha1
and !path was, especially given that it came from René at ae64bbc
(tar-tree: Introduce write_entry(), 2006-03-25) --- it even claims that
these are "reasonable" magic values ;-).

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/5] archive-tar: stream large blobs to tar file
  2012-04-30  4:57 ` [PATCH 4/5] archive-tar: stream large blobs to tar file Nguyễn Thái Ngọc Duy
@ 2012-04-30 19:01   ` Junio C Hamano
  2012-04-30 21:08   ` René Scharfe
  1 sibling, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-04-30 19:01 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, René Scharfe

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>  archive-tar.c    |   38 +++++++++++++++++++++++++++++++++++---
>  t/t1050-large.sh |    4 ++++
>  2 files changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/archive-tar.c b/archive-tar.c
> index 61821f4..865ef6d 100644
> --- a/archive-tar.c
> +++ b/archive-tar.c
> @@ -4,6 +4,7 @@
>  #include "cache.h"
>  #include "tar.h"
>  #include "archive.h"
> +#include "streaming.h"
>  #include "run-command.h"
>  
>  #define RECORDSIZE	(512)
> @@ -62,6 +63,29 @@ static void write_blocked(const void *data, unsigned long size)
>  	write_if_needed();
>  }
>  
> +static int stream_blob_to_file(const unsigned char *sha1)

The name of this function is misleading and caused me to look for an
existing implementation that streams blob contents to a given fd while
wondering how that fd is passed to this function.

I do not think it would make sense to reuse stream_blob_to_fd() here, as
is it too much hassle to account for blocking factors.  In order to make
it clear for future futzers that this is not something they can turn into
global function and reuse outside the context of generating tar archive
output, please rename it either to stream_blob_to_tarfile(), which is more
specific, or stream_blocked(), which is too vague and nobody sane would
think about exporting from the file just like existing write_blocked() is.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 5/5] archive-zip: stream large blobs into zip file
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
@ 2012-04-30 19:12   ` Junio C Hamano
  2012-04-30 22:54     ` René Scharfe
  2012-04-30 22:11   ` [PATCH 5a/5] streaming: void pointer instead of char pointer René Scharfe
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-04-30 19:12 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, René Scharfe

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> A large blob will be read twice. One for calculating crc32, one for
> actual writing.

Is that because you need the checksum before the payload?  That is
unfortunate.  It would be nice (read: not a rejection of this patch---it
is a good first step to do it stupid but correct way before trying to
optimize it) to avoid it when the output is seekable, especially because
we are talking about a *large* payload.

> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
> index fe47554..458fdde 100755
> --- a/t/t1050-large.sh
> +++ b/t/t1050-large.sh
> @@ -138,4 +138,8 @@ test_expect_success 'tar achiving' '
>  	git archive --format=tar HEAD >/dev/null
>  '
>  
> +test_expect_success 'zip achiving' '
> +	git archive --format=zip HEAD >/dev/null
> +'

Can't we do better than "we only check if it finishes without barfing; we
cannot afford to check the correctness of the output"?  The same comment
applies to all the tests you added to this file in the past 3 months.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/5] Large file support for git-archive
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
                   ` (4 preceding siblings ...)
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
@ 2012-04-30 19:15 ` Junio C Hamano
  2012-04-30 21:07 ` René Scharfe
  6 siblings, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-04-30 19:15 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, René Scharfe

Overall this was a very pleasant read.  Thanks.  Will queue.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] archive: delegate blob reading to backend
  2012-04-30  4:57 ` [PATCH 3/5] archive: delegate blob reading to backend Nguyễn Thái Ngọc Duy
@ 2012-04-30 21:07   ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 21:07 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Am 30.04.2012 06:57, schrieb Nguyễn Thái Ngọc Duy:
> archive-tar.c and archive-zip.c now perform conversion check, with
> help of sha1_file_to_archive() from archive.c
> 
> This gives backends more freedom in dealing with (streaming) large
> blobs.
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy<pclouds@gmail.com>
> ---
>   archive-tar.c |   20 +++++++++++++++++---
>   archive-zip.c |   14 ++++++++++++--
>   archive.c     |   28 +++++++++++-----------------
>   archive.h     |   10 +++++++++-
>   4 files changed, 49 insertions(+), 23 deletions(-)
> 
> diff --git a/archive-tar.c b/archive-tar.c
> index 6c8a0bd..61821f4 100644
> --- a/archive-tar.c
> +++ b/archive-tar.c
> @@ -161,11 +161,15 @@ static int write_extended_header(struct archiver_args *args,
>   }
> 
>   static int write_tar_entry(struct archiver_args *args,
> -		const unsigned char *sha1, const char *path, size_t pathlen,
> -		unsigned int mode, void *buffer, unsigned long size)
> +			   const unsigned char *sha1,
> +			   const char *path, size_t pathlen,
> +			   unsigned int mode)
>   {
>   	struct ustar_header header;
>   	struct strbuf ext_header = STRBUF_INIT;
> +	unsigned int old_mode = mode;
> +	unsigned long size;
> +	void *buffer;
>   	int err = 0;
> 
>   	memset(&header, 0, sizeof(header));
> @@ -199,7 +203,17 @@ static int write_tar_entry(struct archiver_args *args,
>   	} else
>   		memcpy(header.name, path, pathlen);
> 
> -	if (S_ISLNK(mode)&&  buffer) {
> +	if (S_ISLNK(mode) || S_ISREG(mode)) {
> +		enum object_type type;
> +		buffer = sha1_file_to_archive(args, path, sha1, old_mode,&type,&size);

This buffer is never freed.

> +		if (!buffer)
> +			return error("cannot read %s", sha1_to_hex(sha1));
> +	} else {
> +		buffer = NULL;
> +		size = 0;
> +	}
> +
> +	if (S_ISLNK(mode)) {
>   		if (size>  sizeof(header.linkname)) {
>   			sprintf(header.linkname, "see %s.paxheader",
>   			        sha1_to_hex(sha1));
> diff --git a/archive-zip.c b/archive-zip.c
> index 02d1f37..f8039ba 100644
> --- a/archive-zip.c
> +++ b/archive-zip.c
> @@ -121,8 +121,9 @@ static void *zlib_deflate(void *data, unsigned long size,
>   }
> 
>   static int write_zip_entry(struct archiver_args *args,
> -		const unsigned char *sha1, const char *path, size_t pathlen,
> -		unsigned int mode, void *buffer, unsigned long size)
> +			   const unsigned char *sha1,
> +			   const char *path, size_t pathlen,
> +			   unsigned int mode)
>   {
>   	struct zip_local_header header;
>   	struct zip_dir_header dirent;
> @@ -134,6 +135,8 @@ static int write_zip_entry(struct archiver_args *args,
>   	int method;
>   	unsigned char *out;
>   	void *deflated = NULL;
> +	void *buffer;
> +	unsigned long size;
> 
>   	crc = crc32(0, NULL, 0);
> 
> @@ -148,7 +151,14 @@ static int write_zip_entry(struct archiver_args *args,
>   		out = NULL;
>   		uncompressed_size = 0;
>   		compressed_size = 0;
> +		buffer = NULL;
> +		size = 0;
>   	} else if (S_ISREG(mode) || S_ISLNK(mode)) {
> +		enum object_type type;
> +		buffer = sha1_file_to_archive(args, path, sha1, mode,&type,&size);

This one is leaked as well.

> +		if (!buffer)
> +			return error("cannot read %s", sha1_to_hex(sha1));
> +
>   		method = 0;
>   		attr2 = S_ISLNK(mode) ? ((mode | 0777)<<  16) :
>   			(mode&  0111) ? ((mode)<<  16) : 0;

You could squash this in:


diff --git a/archive-tar.c b/archive-tar.c
index 61821f4..bb320ad 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -235,6 +235,7 @@ static int write_tar_entry(struct archiver_args *args,
 	write_blocked(&header, sizeof(header));
 	if (S_ISREG(mode) && buffer && size > 0)
 		write_blocked(buffer, size);
+	free(buffer);
 	return err;
 }
 
diff --git a/archive-zip.c b/archive-zip.c
index f8039ba..716cc42 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -239,6 +239,7 @@ static int write_zip_entry(struct archiver_args *args,
 	}
 
 	free(deflated);
+	free(buffer);
 
 	return 0;
 }

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/5] Large file support for git-archive
  2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
                   ` (5 preceding siblings ...)
  2012-04-30 19:15 ` [PATCH 0/5] Large file support for git-archive Junio C Hamano
@ 2012-04-30 21:07 ` René Scharfe
  2012-05-01 10:19   ` Nguyen Thai Ngoc Duy
  6 siblings, 1 reply; 23+ messages in thread
From: René Scharfe @ 2012-04-30 21:07 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Am 30.04.2012 06:57, schrieb Nguyễn Thái Ngọc Duy:
> This is a spin-off from the large file support series I posted some
> time ago [1]. Both tar and zip format now support streaming large
> blobs. Although zip will store uncompressed large blobs.
>
> [1] http://thread.gmane.org/gmane.comp.version-control.git/191605
>
> Nguyễn Thái Ngọc Duy (5):
>    archive-tar: turn write_tar_entry into blob-writing only
>    archive-tar: unindent write_tar_entry by one level
>    archive: delegate blob reading to backend
>    archive-tar: stream large blobs to tar file
>    archive-zip: stream large blobs into zip file
>
>   Documentation/git-archive.txt |    3 +
>   archive-tar.c                 |  184 ++++++++++++++++++++++++++++-------------
>   archive-zip.c                 |   56 ++++++++++++-
>   archive.c                     |   28 +++----
>   archive.h                     |   10 ++-
>   t/t1050-large.sh              |    8 ++
>   6 files changed, 209 insertions(+), 80 deletions(-)

I like the cleanups in the first two patches.

I'm neutral to positive on the third one; it certainly simplifies the 
interface to the backends with only little code duplication.

The ZIP format supports streaming natively (look for "Data descriptor" 
in http://www.pkware.com/documents/casestudies/APPNOTE.TXT). 
Incidentally, I had been working on supporting that, but without any 
presentable results.  So far.  I've adapted the pieces I had to your 
series now; will send shortly.

René

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/5] archive-tar: stream large blobs to tar file
  2012-04-30  4:57 ` [PATCH 4/5] archive-tar: stream large blobs to tar file Nguyễn Thái Ngọc Duy
  2012-04-30 19:01   ` Junio C Hamano
@ 2012-04-30 21:08   ` René Scharfe
  2012-04-30 21:36     ` Junio C Hamano
  1 sibling, 1 reply; 23+ messages in thread
From: René Scharfe @ 2012-04-30 21:08 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Am 30.04.2012 06:57, schrieb Nguyễn Thái Ngọc Duy:
> 
> Signed-off-by: Nguyễn Thái Ngọc Duy<pclouds@gmail.com>
> ---
>   archive-tar.c    |   38 +++++++++++++++++++++++++++++++++++---
>   t/t1050-large.sh |    4 ++++
>   2 files changed, 39 insertions(+), 3 deletions(-)
> 
> diff --git a/archive-tar.c b/archive-tar.c
> index 61821f4..865ef6d 100644
> --- a/archive-tar.c
> +++ b/archive-tar.c
> @@ -4,6 +4,7 @@
>   #include "cache.h"
>   #include "tar.h"
>   #include "archive.h"
> +#include "streaming.h"
>   #include "run-command.h"
> 
>   #define RECORDSIZE	(512)
> @@ -62,6 +63,29 @@ static void write_blocked(const void *data, unsigned long size)
>   	write_if_needed();
>   }
> 
> +static int stream_blob_to_file(const unsigned char *sha1)
> +{
> +	struct git_istream *st;
> +	enum object_type type;
> +	unsigned long sz;
> +
> +	st = open_istream(sha1,&type,&sz, NULL);
> +	if (!st)
> +		return error("cannot stream blob %s", sha1_to_hex(sha1));
> +	for (;;) {
> +		char buf[BLOCKSIZE];
> +		ssize_t readlen;
> +
> +		readlen = read_istream(st, buf, sizeof(buf));
> +
> +		if (readlen <= 0)
> +			return readlen;
> +		write_blocked(buf, readlen);
> +	}
> +	close_istream(st);
> +	return 0;
> +}

The stream is never closed.  Perhaps squash this in?


diff --git a/archive-tar.c b/archive-tar.c
index 506c8cb..6109fd3 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -66,6 +66,7 @@ static void write_blocked(const void *data, unsigned long size)
 static int stream_blob_to_file(const unsigned char *sha1)
 {
 	struct git_istream *st;
+	ssize_t readlen;
 	enum object_type type;
 	unsigned long sz;
 
@@ -74,16 +75,15 @@ static int stream_blob_to_file(const unsigned char *sha1)
 		return error("cannot stream blob %s", sha1_to_hex(sha1));
 	for (;;) {
 		char buf[BLOCKSIZE];
-		ssize_t readlen;
 
 		readlen = read_istream(st, buf, sizeof(buf));
 
 		if (readlen <= 0)
-			return readlen;
+			break;
 		write_blocked(buf, readlen);
 	}
 	close_istream(st);
-	return 0;
+	return readlen;
 }
 
 /*

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/5] archive-tar: stream large blobs to tar file
  2012-04-30 21:08   ` René Scharfe
@ 2012-04-30 21:36     ` Junio C Hamano
  2012-04-30 22:12       ` René Scharfe
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-04-30 21:36 UTC (permalink / raw)
  To: René Scharfe; +Cc: Nguyễn Thái Ngọc Duy, git

René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:

>> +static int stream_blob_to_file(const unsigned char *sha1)
>> +{
>> +	struct git_istream *st;
>> +	enum object_type type;
>> +	unsigned long sz;
>> +
>> +	st = open_istream(sha1,&type,&sz, NULL);
>> +	if (!st)
>> +		return error("cannot stream blob %s", sha1_to_hex(sha1));
>> +	for (;;) {
>> +		char buf[BLOCKSIZE];
>> +		ssize_t readlen;
>> +
>> +		readlen = read_istream(st, buf, sizeof(buf));
>> +
>> +		if (readlen <= 0)
>> +			return readlen;
>> +		write_blocked(buf, readlen);
>> +	}
>> +	close_istream(st);
>> +	return 0;
>> +}
>
> The stream is never closed.  Perhaps squash this in?
>
> diff --git a/archive-tar.c b/archive-tar.c
> index 506c8cb..6109fd3 100644
> --- a/archive-tar.c
> +++ b/archive-tar.c
> @@ -66,6 +66,7 @@ static void write_blocked(const void *data, unsigned long size)
>  static int stream_blob_to_file(const unsigned char *sha1)
>  {
>  	struct git_istream *st;
> +	ssize_t readlen;
>  	enum object_type type;
>  	unsigned long sz;
>  
> @@ -74,16 +75,15 @@ static int stream_blob_to_file(const unsigned char *sha1)
>  		return error("cannot stream blob %s", sha1_to_hex(sha1));
>  	for (;;) {
>  		char buf[BLOCKSIZE];
> -		ssize_t readlen;
>  
>  		readlen = read_istream(st, buf, sizeof(buf));
>  
>  		if (readlen <= 0)
> -			return readlen;
> +			break;
>  		write_blocked(buf, readlen);
>  	}
>  	close_istream(st);
> -	return 0;
> +	return readlen;
>  }

Your patch on top obviouly is the right thing to do, but reading the code
again, I am not sure if the original is correct.  read_istream() itself
does not promise that it will always fill the buffer before returning (it
could return with a short read).  It seems incorrect that the caller does
not loop to avoid padding a short read with padding by calling
write_blocked().

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only
  2012-04-30 18:15   ` Junio C Hamano
@ 2012-04-30 22:11     ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:11 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

Am 30.04.2012 20:15, schrieb Junio C Hamano:
> Nguyễn Thái Ngọc Duy<pclouds@gmail.com>  writes:
>
>> Before this patch write_tar_entry() can:
>>
>>   - write global header
>>     by write_global_extended_header() calling write_tar_entry with
>>     with both sha1 and path == NULL
>>
>>   - write extended header for symlinks, by write_tar_entry() calling
>>     itself with sha1 != NULL and path == NULL
>>
>>   - write a normal blob. In this case both sha1 and path are valid.
>>
>> After this patch, the first two call sites are modified to write the
>> header without calling write_tar_entry(). The function is now for
>> writing blobs only.
>
> Nice.
>
> I am kind of surprised how hacky the original code that switched on !sha1
> and !path was, especially given that it came from René at ae64bbc
> (tar-tree: Introduce write_entry(), 2006-03-25) --- it even claims that
> these are "reasonable" magic values ;-).

Yeah, and there are still opportunities for cleanup.  Can't say what I 
was thinking back then, but a few cleanup attempts since then invariably 
made the code even more ugly, so I never sent them.

René

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 5a/5] streaming: void pointer instead of char pointer
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
  2012-04-30 19:12   ` Junio C Hamano
@ 2012-04-30 22:11   ` René Scharfe
  2012-04-30 22:12   ` [PATCH 6a/5] archive-zip: remove uncompressed_size René Scharfe
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:11 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Allow any kind of buffer to be fed to read_istream() without an explicit
cast by making it's buf argument a void pointer.  It's about arbitrary
data, not only characters.

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 streaming.c |    2 +-
 streaming.h |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/streaming.c b/streaming.c
index 7e7ee2b..3a3cd12 100644
--- a/streaming.c
+++ b/streaming.c
@@ -99,7 +99,7 @@ int close_istream(struct git_istream *st)
 	return r;
 }
 
-ssize_t read_istream(struct git_istream *st, char *buf, size_t sz)
+ssize_t read_istream(struct git_istream *st, void *buf, size_t sz)
 {
 	return st->vtbl->read(st, buf, sz);
 }
diff --git a/streaming.h b/streaming.h
index 3e82770..1d05c2a 100644
--- a/streaming.h
+++ b/streaming.h
@@ -10,7 +10,7 @@ struct git_istream;
 
 extern struct git_istream *open_istream(const unsigned char *, enum object_type *, unsigned long *, struct stream_filter *);
 extern int close_istream(struct git_istream *);
-extern ssize_t read_istream(struct git_istream *, char *, size_t);
+extern ssize_t read_istream(struct git_istream *, void *, size_t);
 
 extern int stream_blob_to_fd(int fd, const unsigned char *, struct stream_filter *, int can_seek);
 
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/5] archive-tar: stream large blobs to tar file
  2012-04-30 21:36     ` Junio C Hamano
@ 2012-04-30 22:12       ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:12 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

Am 30.04.2012 23:36, schrieb Junio C Hamano:
> René Scharfe<rene.scharfe@lsrfire.ath.cx>  writes:
>
>>> +static int stream_blob_to_file(const unsigned char *sha1)
>>> +{
>>> +	struct git_istream *st;
>>> +	enum object_type type;
>>> +	unsigned long sz;
>>> +
>>> +	st = open_istream(sha1,&type,&sz, NULL);
>>> +	if (!st)
>>> +		return error("cannot stream blob %s", sha1_to_hex(sha1));
>>> +	for (;;) {
>>> +		char buf[BLOCKSIZE];
>>> +		ssize_t readlen;
>>> +
>>> +		readlen = read_istream(st, buf, sizeof(buf));
>>> +
>>> +		if (readlen<= 0)
>>> +			return readlen;
>>> +		write_blocked(buf, readlen);
>>> +	}
>>> +	close_istream(st);
>>> +	return 0;
>>> +}
>>
>> The stream is never closed.  Perhaps squash this in?
>>
>> diff --git a/archive-tar.c b/archive-tar.c
>> index 506c8cb..6109fd3 100644
>> --- a/archive-tar.c
>> +++ b/archive-tar.c
>> @@ -66,6 +66,7 @@ static void write_blocked(const void *data, unsigned long size)
>>   static int stream_blob_to_file(const unsigned char *sha1)
>>   {
>>   	struct git_istream *st;
>> +	ssize_t readlen;
>>   	enum object_type type;
>>   	unsigned long sz;
>>
>> @@ -74,16 +75,15 @@ static int stream_blob_to_file(const unsigned char *sha1)
>>   		return error("cannot stream blob %s", sha1_to_hex(sha1));
>>   	for (;;) {
>>   		char buf[BLOCKSIZE];
>> -		ssize_t readlen;
>>
>>   		readlen = read_istream(st, buf, sizeof(buf));
>>
>>   		if (readlen<= 0)
>> -			return readlen;
>> +			break;
>>   		write_blocked(buf, readlen);
>>   	}
>>   	close_istream(st);
>> -	return 0;
>> +	return readlen;
>>   }
>
> Your patch on top obviouly is the right thing to do, but reading the code
> again, I am not sure if the original is correct.  read_istream() itself
> does not promise that it will always fill the buffer before returning (it
> could return with a short read).  It seems incorrect that the caller does
> not loop to avoid padding a short read with padding by calling
> write_blocked().

Yes, indeed, good catch.  It could also write to block directly and 
avoid copying the buffer again.  The tail clearing part of 
write_blocked() can certainly be reused, but the rest will probably have 
to be reimplemented around read_istream().

René

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 6a/5] archive-zip: remove uncompressed_size
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
  2012-04-30 19:12   ` Junio C Hamano
  2012-04-30 22:11   ` [PATCH 5a/5] streaming: void pointer instead of char pointer René Scharfe
@ 2012-04-30 22:12   ` René Scharfe
  2012-04-30 22:12   ` [PATCH 7a/5] archive-zip: factor out helpers for writing sizes and CRC René Scharfe
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:12 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

We only need size and compressed_size.

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 archive-zip.c |    8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/archive-zip.c b/archive-zip.c
index 716cc42..400ba38 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -129,7 +129,6 @@ static int write_zip_entry(struct archiver_args *args,
 	struct zip_dir_header dirent;
 	unsigned long attr2;
 	unsigned long compressed_size;
-	unsigned long uncompressed_size;
 	unsigned long crc;
 	unsigned long direntsize;
 	int method;
@@ -149,7 +148,7 @@ static int write_zip_entry(struct archiver_args *args,
 		method = 0;
 		attr2 = 16;
 		out = NULL;
-		uncompressed_size = 0;
+		size = 0;
 		compressed_size = 0;
 		buffer = NULL;
 		size = 0;
@@ -166,7 +165,6 @@ static int write_zip_entry(struct archiver_args *args,
 			method = 8;
 		crc = crc32(crc, buffer, size);
 		out = buffer;
-		uncompressed_size = size;
 		compressed_size = size;
 	} else {
 		return error("unsupported file mode: 0%o (SHA1: %s)", mode,
@@ -204,7 +202,7 @@ static int write_zip_entry(struct archiver_args *args,
 	copy_le16(dirent.mdate, zip_date);
 	copy_le32(dirent.crc32, crc);
 	copy_le32(dirent.compressed_size, compressed_size);
-	copy_le32(dirent.size, uncompressed_size);
+	copy_le32(dirent.size, size);
 	copy_le16(dirent.filename_length, pathlen);
 	copy_le16(dirent.extra_length, 0);
 	copy_le16(dirent.comment_length, 0);
@@ -226,7 +224,7 @@ static int write_zip_entry(struct archiver_args *args,
 	copy_le16(header.mdate, zip_date);
 	copy_le32(header.crc32, crc);
 	copy_le32(header.compressed_size, compressed_size);
-	copy_le32(header.size, uncompressed_size);
+	copy_le32(header.size, size);
 	copy_le16(header.filename_length, pathlen);
 	copy_le16(header.extra_length, 0);
 	write_or_die(1, &header, ZIP_LOCAL_HEADER_SIZE);
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 7a/5] archive-zip: factor out helpers for writing sizes and CRC
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
                     ` (2 preceding siblings ...)
  2012-04-30 22:12   ` [PATCH 6a/5] archive-zip: remove uncompressed_size René Scharfe
@ 2012-04-30 22:12   ` René Scharfe
  2012-04-30 22:12   ` [PATCH 8a/5] archive-zip: streaming for stored files René Scharfe
  2012-04-30 22:12   ` [PATCH 9a/5] archive-zip: streaming for deflated files René Scharfe
  5 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:12 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

We're going to reuse them soon for streaming.  Also, update the ZIP
directory only at the very end, which will also make streaming easier.

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 archive-zip.c |   39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/archive-zip.c b/archive-zip.c
index 400ba38..678569a 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -120,6 +120,26 @@ static void *zlib_deflate(void *data, unsigned long size,
 	return buffer;
 }
 
+static void set_zip_dir_data_desc(struct zip_dir_header *header,
+				  unsigned long size,
+				  unsigned long compressed_size,
+				  unsigned long crc)
+{
+	copy_le32(header->crc32, crc);
+	copy_le32(header->compressed_size, compressed_size);
+	copy_le32(header->size, size);
+}
+
+static void set_zip_header_data_desc(struct zip_local_header *header,
+				     unsigned long size,
+				     unsigned long compressed_size,
+				     unsigned long crc)
+{
+	copy_le32(header->crc32, crc);
+	copy_le32(header->compressed_size, compressed_size);
+	copy_le32(header->size, size);
+}
+
 static int write_zip_entry(struct archiver_args *args,
 			   const unsigned char *sha1,
 			   const char *path, size_t pathlen,
@@ -200,9 +220,7 @@ static int write_zip_entry(struct archiver_args *args,
 	copy_le16(dirent.compression_method, method);
 	copy_le16(dirent.mtime, zip_time);
 	copy_le16(dirent.mdate, zip_date);
-	copy_le32(dirent.crc32, crc);
-	copy_le32(dirent.compressed_size, compressed_size);
-	copy_le32(dirent.size, size);
+	set_zip_dir_data_desc(&dirent, size, compressed_size, crc);
 	copy_le16(dirent.filename_length, pathlen);
 	copy_le16(dirent.extra_length, 0);
 	copy_le16(dirent.comment_length, 0);
@@ -210,11 +228,6 @@ static int write_zip_entry(struct archiver_args *args,
 	copy_le16(dirent.attr1, 0);
 	copy_le32(dirent.attr2, attr2);
 	copy_le32(dirent.offset, zip_offset);
-	memcpy(zip_dir + zip_dir_offset, &dirent, ZIP_DIR_HEADER_SIZE);
-	zip_dir_offset += ZIP_DIR_HEADER_SIZE;
-	memcpy(zip_dir + zip_dir_offset, path, pathlen);
-	zip_dir_offset += pathlen;
-	zip_dir_entries++;
 
 	copy_le32(header.magic, 0x04034b50);
 	copy_le16(header.version, 10);
@@ -222,9 +235,7 @@ static int write_zip_entry(struct archiver_args *args,
 	copy_le16(header.compression_method, method);
 	copy_le16(header.mtime, zip_time);
 	copy_le16(header.mdate, zip_date);
-	copy_le32(header.crc32, crc);
-	copy_le32(header.compressed_size, compressed_size);
-	copy_le32(header.size, size);
+	set_zip_header_data_desc(&header, size, compressed_size, crc);
 	copy_le16(header.filename_length, pathlen);
 	copy_le16(header.extra_length, 0);
 	write_or_die(1, &header, ZIP_LOCAL_HEADER_SIZE);
@@ -239,6 +250,12 @@ static int write_zip_entry(struct archiver_args *args,
 	free(deflated);
 	free(buffer);
 
+	memcpy(zip_dir + zip_dir_offset, &dirent, ZIP_DIR_HEADER_SIZE);
+	zip_dir_offset += ZIP_DIR_HEADER_SIZE;
+	memcpy(zip_dir + zip_dir_offset, path, pathlen);
+	zip_dir_offset += pathlen;
+	zip_dir_entries++;
+
 	return 0;
 }
 
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 8a/5] archive-zip: streaming for stored files
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
                     ` (3 preceding siblings ...)
  2012-04-30 22:12   ` [PATCH 7a/5] archive-zip: factor out helpers for writing sizes and CRC René Scharfe
@ 2012-04-30 22:12   ` René Scharfe
  2012-04-30 22:12   ` [PATCH 9a/5] archive-zip: streaming for deflated files René Scharfe
  5 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:12 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Write a data descriptor containing the CRC of the entry and its sizes
after streaming it out.  For simplicity, do that only if we're storing
files (option -0) for now.

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 archive-zip.c    |   90 ++++++++++++++++++++++++++++++++++++++++++++++--------
 t/t1050-large.sh |    4 +++
 2 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/archive-zip.c b/archive-zip.c
index 678569a..1c6c39d 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -3,6 +3,7 @@
  */
 #include "cache.h"
 #include "archive.h"
+#include "streaming.h"
 
 static int zip_date;
 static int zip_time;
@@ -15,6 +16,7 @@ static unsigned int zip_dir_offset;
 static unsigned int zip_dir_entries;
 
 #define ZIP_DIRECTORY_MIN_SIZE	(1024 * 1024)
+#define ZIP_STREAM (8)
 
 struct zip_local_header {
 	unsigned char magic[4];
@@ -31,6 +33,14 @@ struct zip_local_header {
 	unsigned char _end[1];
 };
 
+struct zip_data_desc {
+	unsigned char magic[4];
+	unsigned char crc32[4];
+	unsigned char compressed_size[4];
+	unsigned char size[4];
+	unsigned char _end[1];
+};
+
 struct zip_dir_header {
 	unsigned char magic[4];
 	unsigned char creator_version[2];
@@ -70,6 +80,7 @@ struct zip_dir_trailer {
  * we're interested in.
  */
 #define ZIP_LOCAL_HEADER_SIZE	offsetof(struct zip_local_header, _end)
+#define ZIP_DATA_DESC_SIZE	offsetof(struct zip_data_desc, _end)
 #define ZIP_DIR_HEADER_SIZE	offsetof(struct zip_dir_header, _end)
 #define ZIP_DIR_TRAILER_SIZE	offsetof(struct zip_dir_trailer, _end)
 
@@ -120,6 +131,19 @@ static void *zlib_deflate(void *data, unsigned long size,
 	return buffer;
 }
 
+static void write_zip_data_desc(unsigned long size,
+				unsigned long compressed_size,
+				unsigned long crc)
+{
+	struct zip_data_desc trailer;
+
+	copy_le32(trailer.magic, 0x08074b50);
+	copy_le32(trailer.crc32, crc);
+	copy_le32(trailer.compressed_size, compressed_size);
+	copy_le32(trailer.size, size);
+	write_or_die(1, &trailer, ZIP_DATA_DESC_SIZE);
+}
+
 static void set_zip_dir_data_desc(struct zip_dir_header *header,
 				  unsigned long size,
 				  unsigned long compressed_size,
@@ -140,6 +164,8 @@ static void set_zip_header_data_desc(struct zip_local_header *header,
 	copy_le32(header->size, size);
 }
 
+#define STREAM_BUFFER_SIZE (1024 * 16)
+
 static int write_zip_entry(struct archiver_args *args,
 			   const unsigned char *sha1,
 			   const char *path, size_t pathlen,
@@ -155,6 +181,8 @@ static int write_zip_entry(struct archiver_args *args,
 	unsigned char *out;
 	void *deflated = NULL;
 	void *buffer;
+	struct git_istream *stream = NULL;
+	unsigned long flags = 0;
 	unsigned long size;
 
 	crc = crc32(0, NULL, 0);
@@ -173,25 +201,38 @@ static int write_zip_entry(struct archiver_args *args,
 		buffer = NULL;
 		size = 0;
 	} else if (S_ISREG(mode) || S_ISLNK(mode)) {
-		enum object_type type;
-		buffer = sha1_file_to_archive(args, path, sha1, mode, &type, &size);
-		if (!buffer)
-			return error("cannot read %s", sha1_to_hex(sha1));
+		enum object_type type = sha1_object_info(sha1, &size);
 
 		method = 0;
 		attr2 = S_ISLNK(mode) ? ((mode | 0777) << 16) :
 			(mode & 0111) ? ((mode) << 16) : 0;
-		if (S_ISREG(mode) && args->compression_level != 0)
+		if (S_ISREG(mode) && args->compression_level != 0 && size > 0)
 			method = 8;
-		crc = crc32(crc, buffer, size);
-		out = buffer;
 		compressed_size = size;
+
+		if (S_ISREG(mode) && type == OBJ_BLOB && !args->convert &&
+		    size > big_file_threshold && method == 0) {
+			stream = open_istream(sha1, &type, &size, NULL);
+			if (!stream)
+				return error("cannot stream blob %s",
+					     sha1_to_hex(sha1));
+			flags |= ZIP_STREAM;
+			out = buffer = NULL;
+		} else {
+			buffer = sha1_file_to_archive(args, path, sha1, mode,
+						      &type, &size);
+			if (!buffer)
+				return error("cannot read %s",
+					     sha1_to_hex(sha1));
+			crc = crc32(crc, buffer, size);
+			out = buffer;
+		}
 	} else {
 		return error("unsupported file mode: 0%o (SHA1: %s)", mode,
 				sha1_to_hex(sha1));
 	}
 
-	if (method == 8) {
+	if (buffer && method == 8) {
 		deflated = zlib_deflate(buffer, size, args->compression_level,
 				&compressed_size);
 		if (deflated && compressed_size - 6 < size) {
@@ -216,7 +257,7 @@ static int write_zip_entry(struct archiver_args *args,
 	copy_le16(dirent.creator_version,
 		S_ISLNK(mode) || (S_ISREG(mode) && (mode & 0111)) ? 0x0317 : 0);
 	copy_le16(dirent.version, 10);
-	copy_le16(dirent.flags, 0);
+	copy_le16(dirent.flags, flags);
 	copy_le16(dirent.compression_method, method);
 	copy_le16(dirent.mtime, zip_time);
 	copy_le16(dirent.mdate, zip_date);
@@ -231,18 +272,43 @@ static int write_zip_entry(struct archiver_args *args,
 
 	copy_le32(header.magic, 0x04034b50);
 	copy_le16(header.version, 10);
-	copy_le16(header.flags, 0);
+	copy_le16(header.flags, flags);
 	copy_le16(header.compression_method, method);
 	copy_le16(header.mtime, zip_time);
 	copy_le16(header.mdate, zip_date);
-	set_zip_header_data_desc(&header, size, compressed_size, crc);
+	if (flags & ZIP_STREAM)
+		set_zip_header_data_desc(&header, 0, 0, 0);
+	else
+		set_zip_header_data_desc(&header, size, compressed_size, crc);
 	copy_le16(header.filename_length, pathlen);
 	copy_le16(header.extra_length, 0);
 	write_or_die(1, &header, ZIP_LOCAL_HEADER_SIZE);
 	zip_offset += ZIP_LOCAL_HEADER_SIZE;
 	write_or_die(1, path, pathlen);
 	zip_offset += pathlen;
-	if (compressed_size > 0) {
+	if (stream && method == 0) {
+		unsigned char buf[STREAM_BUFFER_SIZE];
+		ssize_t readlen;
+
+		for (;;) {
+			readlen = read_istream(stream, buf, sizeof(buf));
+			if (readlen <= 0)
+				break;
+			crc = crc32(crc, buf, readlen);
+			write_or_die(1, buf, readlen);
+		}
+		close_istream(stream);
+		if (readlen)
+			return readlen;
+
+		compressed_size = size;
+		zip_offset += compressed_size;
+
+		write_zip_data_desc(size, compressed_size, crc);
+		zip_offset += ZIP_DATA_DESC_SIZE;
+
+		set_zip_dir_data_desc(&dirent, size, compressed_size, crc);
+	} else if (compressed_size > 0) {
 		write_or_die(1, out, compressed_size);
 		zip_offset += compressed_size;
 	}
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index fe47554..9db54b5 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -138,4 +138,8 @@ test_expect_success 'tar achiving' '
 	git archive --format=tar HEAD >/dev/null
 '
 
+test_expect_success 'zip achiving, store only' '
+	git archive --format=zip -0 HEAD >/dev/null
+'
+
 test_done
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 9a/5] archive-zip: streaming for deflated files
  2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
                     ` (4 preceding siblings ...)
  2012-04-30 22:12   ` [PATCH 8a/5] archive-zip: streaming for stored files René Scharfe
@ 2012-04-30 22:12   ` René Scharfe
  5 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:12 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

After an entry has been streamed out, its CRC and sizes are written as
part of a data descriptor.

For simplicity, we make the buffer for the compressed chunks twice as
big as for the uncompressed ones, to be sure the result fit in even
if deflate makes them bigger.

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 archive-zip.c    |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 t/t1050-large.sh |    4 ++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/archive-zip.c b/archive-zip.c
index 1c6c39d..f5af81f 100644
--- a/archive-zip.c
+++ b/archive-zip.c
@@ -211,7 +211,7 @@ static int write_zip_entry(struct archiver_args *args,
 		compressed_size = size;
 
 		if (S_ISREG(mode) && type == OBJ_BLOB && !args->convert &&
-		    size > big_file_threshold && method == 0) {
+		    size > big_file_threshold) {
 			stream = open_istream(sha1, &type, &size, NULL);
 			if (!stream)
 				return error("cannot stream blob %s",
@@ -308,6 +308,68 @@ static int write_zip_entry(struct archiver_args *args,
 		zip_offset += ZIP_DATA_DESC_SIZE;
 
 		set_zip_dir_data_desc(&dirent, size, compressed_size, crc);
+	} else if (stream && method == 8) {
+		unsigned char buf[STREAM_BUFFER_SIZE];
+		ssize_t readlen;
+		git_zstream zstream;
+		int result;
+		size_t out_len;
+		unsigned char compressed[STREAM_BUFFER_SIZE * 2];
+
+		memset(&zstream, 0, sizeof(zstream));
+		git_deflate_init(&zstream, args->compression_level);
+
+		compressed_size = 0;
+		zstream.next_out = compressed;
+		zstream.avail_out = sizeof(compressed);
+
+		for (;;) {
+			readlen = read_istream(stream, buf, sizeof(buf));
+			if (readlen <= 0)
+				break;
+			crc = crc32(crc, buf, readlen);
+
+			zstream.next_in = buf;
+			zstream.avail_in = readlen;
+			result = git_deflate(&zstream, 0);
+			if (result != Z_OK)
+				die("deflate error (%d)", result);
+			out = compressed;
+			if (!compressed_size)
+				out += 2;
+			out_len = zstream.next_out - out;
+
+			if (out_len > 0) {
+				write_or_die(1, out, out_len);
+				compressed_size += out_len;
+				zstream.next_out = compressed;
+				zstream.avail_out = sizeof(compressed);
+			}
+
+		}
+		close_istream(stream);
+		if (readlen)
+			return readlen;
+
+		zstream.next_in = buf;
+		zstream.avail_in = 0;
+		result = git_deflate(&zstream, Z_FINISH);
+		if (result != Z_STREAM_END)
+			die("deflate error (%d)", result);
+
+		git_deflate_end(&zstream);
+		out = compressed;
+		if (!compressed_size)
+			out += 2;
+		out_len = zstream.next_out - out - 4;
+		write_or_die(1, out, out_len);
+		compressed_size += out_len;
+		zip_offset += compressed_size;
+
+		write_zip_data_desc(size, compressed_size, crc);
+		zip_offset += ZIP_DATA_DESC_SIZE;
+
+		set_zip_dir_data_desc(&dirent, size, compressed_size, crc);
 	} else if (compressed_size > 0) {
 		write_or_die(1, out, compressed_size);
 		zip_offset += compressed_size;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 9db54b5..55ed955 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -142,4 +142,8 @@ test_expect_success 'zip achiving, store only' '
 	git archive --format=zip -0 HEAD >/dev/null
 '
 
+test_expect_success 'zip achiving, deflate' '
+	git archive --format=zip HEAD >/dev/null
+'
+
 test_done
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 5/5] archive-zip: stream large blobs into zip file
  2012-04-30 19:12   ` Junio C Hamano
@ 2012-04-30 22:54     ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2012-04-30 22:54 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

Am 30.04.2012 21:12, schrieb Junio C Hamano:
> Nguyễn Thái Ngọc Duy<pclouds@gmail.com>  writes:
>
>> A large blob will be read twice. One for calculating crc32, one for
>> actual writing.
>
> Is that because you need the checksum before the payload?  That is
> unfortunate.  It would be nice (read: not a rejection of this patch---it
> is a good first step to do it stupid but correct way before trying to
> optimize it) to avoid it when the output is seekable, especially because
> we are talking about a *large* payload.

The ZIP format optionally allows writing the CRC and the sizes after the 
data.  This adds a data descriptor with a size of 16 bytes to each 
streamed entry.  Seeking back and correcting these values in an output 
file would avoid that.

>> diff --git a/t/t1050-large.sh b/t/t1050-large.sh
>> index fe47554..458fdde 100755
>> --- a/t/t1050-large.sh
>> +++ b/t/t1050-large.sh
>> @@ -138,4 +138,8 @@ test_expect_success 'tar achiving' '
>>   	git archive --format=tar HEAD>/dev/null
>>   '
>>
>> +test_expect_success 'zip achiving' '
>> +	git archive --format=zip HEAD>/dev/null
>> +'
>
> Can't we do better than "we only check if it finishes without barfing; we
> cannot afford to check the correctness of the output"?  The same comment
> applies to all the tests you added to this file in the past 3 months.

Streaming to tar can be tested by setting core.big_file_threshold big 
enough, creating a non-streamed version of the archive and comparing it 
to the streamed one.  With the seek trick, this would work for ZIP as 
well.  For streaming with an added data descriptor we'd need to actually 
unpack the ZIP file, though.

René

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/5] Large file support for git-archive
  2012-04-30 21:07 ` René Scharfe
@ 2012-05-01 10:19   ` Nguyen Thai Ngoc Duy
  0 siblings, 0 replies; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-05-01 10:19 UTC (permalink / raw)
  To: René Scharfe, Junio C Hamano; +Cc: git

Thanks both. Will resend with fixes in the first patches and René's
patches with better tests that also verify output. I think I will move
zip tests back to t5000 though because it may change output structure.

2012/5/1 René Scharfe <rene.scharfe@lsrfire.ath.cx>:
> Am 30.04.2012 06:57, schrieb Nguyễn Thái Ngọc Duy:
>
>> This is a spin-off from the large file support series I posted some
>> time ago [1]. Both tar and zip format now support streaming large
>> blobs. Although zip will store uncompressed large blobs.
>>
>> [1] http://thread.gmane.org/gmane.comp.version-control.git/191605
>>
>> Nguyễn Thái Ngọc Duy (5):
>>   archive-tar: turn write_tar_entry into blob-writing only
>>   archive-tar: unindent write_tar_entry by one level
>>   archive: delegate blob reading to backend
>>   archive-tar: stream large blobs to tar file
>>   archive-zip: stream large blobs into zip file
>>
>>  Documentation/git-archive.txt |    3 +
>>  archive-tar.c                 |  184
>> ++++++++++++++++++++++++++++-------------
>>  archive-zip.c                 |   56 ++++++++++++-
>>  archive.c                     |   28 +++----
>>  archive.h                     |   10 ++-
>>  t/t1050-large.sh              |    8 ++
>>  6 files changed, 209 insertions(+), 80 deletions(-)
>
>
> I like the cleanups in the first two patches.
>
> I'm neutral to positive on the third one; it certainly simplifies the
> interface to the backends with only little code duplication.
>
> The ZIP format supports streaming natively (look for "Data descriptor" in
> http://www.pkware.com/documents/casestudies/APPNOTE.TXT). Incidentally, I
> had been working on supporting that, but without any presentable results.
>  So far.  I've adapted the pieces I had to your series now; will send
> shortly.
>
> René



-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2012-05-01 10:19 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-30  4:57 [PATCH 0/5] Large file support for git-archive Nguyễn Thái Ngọc Duy
2012-04-30  4:57 ` [PATCH 1/5] archive-tar: turn write_tar_entry into blob-writing only Nguyễn Thái Ngọc Duy
2012-04-30 18:15   ` Junio C Hamano
2012-04-30 22:11     ` René Scharfe
2012-04-30  4:57 ` [PATCH 2/5] archive-tar: unindent write_tar_entry by one level Nguyễn Thái Ngọc Duy
2012-04-30  4:57 ` [PATCH 3/5] archive: delegate blob reading to backend Nguyễn Thái Ngọc Duy
2012-04-30 21:07   ` René Scharfe
2012-04-30  4:57 ` [PATCH 4/5] archive-tar: stream large blobs to tar file Nguyễn Thái Ngọc Duy
2012-04-30 19:01   ` Junio C Hamano
2012-04-30 21:08   ` René Scharfe
2012-04-30 21:36     ` Junio C Hamano
2012-04-30 22:12       ` René Scharfe
2012-04-30  4:57 ` [PATCH 5/5] archive-zip: stream large blobs into zip file Nguyễn Thái Ngọc Duy
2012-04-30 19:12   ` Junio C Hamano
2012-04-30 22:54     ` René Scharfe
2012-04-30 22:11   ` [PATCH 5a/5] streaming: void pointer instead of char pointer René Scharfe
2012-04-30 22:12   ` [PATCH 6a/5] archive-zip: remove uncompressed_size René Scharfe
2012-04-30 22:12   ` [PATCH 7a/5] archive-zip: factor out helpers for writing sizes and CRC René Scharfe
2012-04-30 22:12   ` [PATCH 8a/5] archive-zip: streaming for stored files René Scharfe
2012-04-30 22:12   ` [PATCH 9a/5] archive-zip: streaming for deflated files René Scharfe
2012-04-30 19:15 ` [PATCH 0/5] Large file support for git-archive Junio C Hamano
2012-04-30 21:07 ` René Scharfe
2012-05-01 10:19   ` Nguyen Thai Ngoc Duy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.