All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] fetch-pack: optionally save packs to disk
       [not found] <CAGZ79kaS4utvDbXOo7emmSUH6M-8LY-oA65Ss3PLDkFModkbSg@mail.gmail.com>
@ 2015-06-11 18:59 ` Augie Fackler
  2015-06-12  6:22   ` Johannes Sixt
  0 siblings, 1 reply; 33+ messages in thread
From: Augie Fackler @ 2015-06-11 18:59 UTC (permalink / raw)
  To: git; +Cc: Stefan Beller, Junio C Hamano, Augie Fackler

When developing server software, it's often helpful to save a
potentially-bogus pack for later analysis. This makes that trivial,
instead of painful. This is made a little complicated by the fact that
in some cases (like cloning from smart-http, but not from a local repo)
the fetch code reads the pack header before sending the pack to
index-pack (which then gets a --pack_header flag). The included tests
cover both of these cases.

To use the new feature, set GIT_SAVE_FETCHED_PACK_TO to a file path and
git-fetch will do the rest. The resulting pack can be examined with
git-index-pack or similar tools (although if it's corrupt, custom tools
can be especially helpful.)

Signed-off-by: Augie Fackler <augie@google.com>
---
 Documentation/git.txt       |  6 ++++++
 fetch-pack.c                | 44 ++++++++++++++++++++++++++++++++++++++++----
 t/t5551-http-fetch-smart.sh | 12 ++++++++++++
 t/t5601-clone.sh            |  9 +++++++++
 4 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/Documentation/git.txt b/Documentation/git.txt
index 45b64a7..31bc3b5 100644
--- a/Documentation/git.txt
+++ b/Documentation/git.txt
@@ -1060,6 +1060,12 @@ GIT_ICASE_PATHSPECS::
 	an operation has touched every ref (e.g., because you are
 	cloning a repository to make a backup).
 
+`GIT_SAVE_FETCHED_PACK_TO`::
+	If set, save any fetched pack to the path in the
+	variable. This is mostly useful if you're writing a custom
+	server and are producing broken packs, as the saved pack won't
+	be cleaned up even if it's corrupt.
+
 
 Discussion[[Discussion]]
 ------------------------
diff --git a/fetch-pack.c b/fetch-pack.c
index a912935..fe6ba58 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -684,7 +684,7 @@ static int get_pack(struct fetch_pack_args *args,
 	const char *argv[22];
 	char keep_arg[256];
 	char hdr_arg[256];
-	const char **av, *cmd_name;
+	const char **av, *cmd_name, *savepath;
 	int do_keep = args->keep_pack;
 	struct child_process cmd = CHILD_PROCESS_INIT;
 	int ret;
@@ -708,9 +708,8 @@ static int get_pack(struct fetch_pack_args *args,
 	cmd.argv = argv;
 	av = argv;
 	*hdr_arg = 0;
+	struct pack_header header;
 	if (!args->keep_pack && unpack_limit) {
-		struct pack_header header;
-
 		if (read_pack_header(demux.out, &header))
 			die("protocol error: bad pack header");
 		snprintf(hdr_arg, sizeof(hdr_arg),
@@ -762,7 +761,44 @@ static int get_pack(struct fetch_pack_args *args,
 		*av++ = "--strict";
 	*av++ = NULL;
 
-	cmd.in = demux.out;
+	savepath = getenv("GIT_SAVE_FETCHED_PACK_TO");
+	if (savepath) {
+		struct child_process cmd2 = CHILD_PROCESS_INIT;
+		const char *argv2[22];
+		int pipefds[2];
+		int e;
+		const char **av2;
+		cmd2.argv = argv2;
+		av2 = argv2;
+		*av2++ = "tee";
+		if (*hdr_arg) {
+			/* hdr_arg being nonempty means we already read the
+			 * pack header from demux, so we need to drop a pack
+			 * header in place for tee to append to, otherwise
+			 * we'll end up with a broken pack on disk.
+			 */
+			int fp;
+			struct sha1file *s;
+			fp = open(savepath, O_CREAT | O_TRUNC | O_WRONLY, 0666);
+			s = sha1fd_throughput(fp, savepath, NULL);
+			sha1write(s, &header, sizeof(header));
+			sha1flush(s);
+			close(fp);
+			/* -a is supported by both GNU and BSD tee */
+			*av2++ = "-a";
+		}
+		*av2++ = savepath;
+		*av2++ = NULL;
+		cmd2.in = demux.out;
+		e = pipe(pipefds);
+		if (e != 0)
+			die("couldn't make pipe to save pack");
+		cmd2.out = pipefds[1];
+		cmd.in = pipefds[0];
+		if (start_command(&cmd2))
+			die("couldn't start tee to save a pack");
+	} else
+		cmd.in = demux.out;
 	cmd.git_cmd = 1;
 	if (start_command(&cmd))
 		die("fetch-pack: unable to fork off %s", cmd_name);
diff --git a/t/t5551-http-fetch-smart.sh b/t/t5551-http-fetch-smart.sh
index 58207d8..bf4640d 100755
--- a/t/t5551-http-fetch-smart.sh
+++ b/t/t5551-http-fetch-smart.sh
@@ -82,11 +82,23 @@ test_expect_success 'fetch changes via http' '
 	test_cmp file clone/file
 '
 
+test_expect_success 'fetch changes via http and save pack' '
+	echo content >>file &&
+	git commit -a -m two &&
+	git push public &&
+	GIT_SAVE_FETCHED_PACK_TO=saved.pack &&
+	export GIT_SAVE_FETCHED_PACK_TO &&
+	(cd clone && git pull) &&
+	git index-pack clone/saved.pack
+'
+
 cat >exp <<EOF
 GET  /smart/repo.git/info/refs?service=git-upload-pack HTTP/1.1 200
 POST /smart/repo.git/git-upload-pack HTTP/1.1 200
 GET  /smart/repo.git/info/refs?service=git-upload-pack HTTP/1.1 200
 POST /smart/repo.git/git-upload-pack HTTP/1.1 200
+GET  /smart/repo.git/info/refs?service=git-upload-pack HTTP/1.1 200
+POST /smart/repo.git/git-upload-pack HTTP/1.1 200
 EOF
 test_expect_success 'used upload-pack service' '
 	sed -e "
diff --git a/t/t5601-clone.sh b/t/t5601-clone.sh
index bfdaf75..73f9e1c 100755
--- a/t/t5601-clone.sh
+++ b/t/t5601-clone.sh
@@ -40,6 +40,15 @@ test_expect_success C_LOCALE_OUTPUT 'output from clone' '
 	test $(grep Clon output | wc -l) = 1
 '
 
+test_expect_success 'clone allows saving a pack' '
+	rm -fr dst saved.pack &&
+	GIT_SAVE_FETCHED_PACK_TO=saved.pack &&
+	export GIT_SAVE_FETCHED_PACK_TO &&
+	git clone -n "file://$(pwd)/src" dst >output 2>&1 &&
+	test -e saved.pack &&
+	git index-pack saved.pack
+'
+
 test_expect_success 'clone does not keep pack' '
 
 	rm -fr dst &&
-- 
2.4.3.369.gda395ba.dirty

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] fetch-pack: optionally save packs to disk
  2015-06-11 18:59 ` [PATCH v2] fetch-pack: optionally save packs to disk Augie Fackler
@ 2015-06-12  6:22   ` Johannes Sixt
  2015-06-12 15:07     ` Junio C Hamano
  2015-06-12 16:54     ` [PATCH v2] fetch-pack: optionally save packs to disk Augie Fackler
  0 siblings, 2 replies; 33+ messages in thread
From: Johannes Sixt @ 2015-06-12  6:22 UTC (permalink / raw)
  To: Augie Fackler, git; +Cc: Stefan Beller, Junio C Hamano

Am 11.06.2015 um 20:59 schrieb Augie Fackler:
> When developing server software, it's often helpful to save a
> potentially-bogus pack for later analysis. This makes that trivial,
> instead of painful.

When you develop server software, shouldn't you test drive the server 
via the bare metal protocol anyway? That *is* painful, but unavoidable 
because you must harden the server against any garbage that a 
potentially malicous client could throw at it. Restricting yourself to a 
well-behaved client such as fetch-pack is only half the deal.

That said, I do think that fetch-pack could learn a mode that makes it 
easier to debug the normal behavior of a server (if such a mode is 
missing currently).

What is the problem with the current fetch-pack implementation? Does it 
remove a bogus packfile after download? Does it abort during download 
when it detects a broken packfile? Does --keep not do what you need?

Instead of your approach (which forks off tee to dump a copy of the 
packfile), would it not be simpler to add an option --debug-pack 
(probably not the best name) that skips the cleanup step when a broken 
packfile is detected and prints the name of the downloaded packfile?

> diff --git a/fetch-pack.c b/fetch-pack.c
> index a912935..fe6ba58 100644
> --- a/fetch-pack.c
> +++ b/fetch-pack.c
> @@ -684,7 +684,7 @@ static int get_pack(struct fetch_pack_args *args,
>   	const char *argv[22];
>   	char keep_arg[256];
>   	char hdr_arg[256];
> -	const char **av, *cmd_name;
> +	const char **av, *cmd_name, *savepath;
>   	int do_keep = args->keep_pack;
>   	struct child_process cmd = CHILD_PROCESS_INIT;
>   	int ret;
> @@ -708,9 +708,8 @@ static int get_pack(struct fetch_pack_args *args,
>   	cmd.argv = argv;
>   	av = argv;
>   	*hdr_arg = 0;
> +	struct pack_header header;
>   	if (!args->keep_pack && unpack_limit) {
> -		struct pack_header header;
> -
>   		if (read_pack_header(demux.out, &header))
>   			die("protocol error: bad pack header");
>   		snprintf(hdr_arg, sizeof(hdr_arg),
> @@ -762,7 +761,44 @@ static int get_pack(struct fetch_pack_args *args,
>   		*av++ = "--strict";
>   	*av++ = NULL;
>
> -	cmd.in = demux.out;
> +	savepath = getenv("GIT_SAVE_FETCHED_PACK_TO");
> +	if (savepath) {
> +		struct child_process cmd2 = CHILD_PROCESS_INIT;
> +		const char *argv2[22];
> +		int pipefds[2];
> +		int e;
> +		const char **av2;
> +		cmd2.argv = argv2;
> +		av2 = argv2;
> +		*av2++ = "tee";
> +		if (*hdr_arg) {
> +			/* hdr_arg being nonempty means we already read the
> +			 * pack header from demux, so we need to drop a pack
> +			 * header in place for tee to append to, otherwise
> +			 * we'll end up with a broken pack on disk.
> +			 */

			/*
			 * Write multi-line comments
			 * like this (/* on its own line)
			 */

> +			int fp;
> +			struct sha1file *s;
> +			fp = open(savepath, O_CREAT | O_TRUNC | O_WRONLY, 0666);
> +			s = sha1fd_throughput(fp, savepath, NULL);
> +			sha1write(s, &header, sizeof(header));
> +			sha1flush(s);

Are you abusing sha1write() and sha1flush() to write a byte sequence to 
a file? Is write_in_full() not sufficient?

> +			close(fp);
> +			/* -a is supported by both GNU and BSD tee */
> +			*av2++ = "-a";
> +		}
> +		*av2++ = savepath;
> +		*av2++ = NULL;
> +		cmd2.in = demux.out;
> +		e = pipe(pipefds);
> +		if (e != 0)
> +			die("couldn't make pipe to save pack");

start_command() can create the pipe for you. Just say cmd2.out = -1.

> +		cmd2.out = pipefds[1];
> +		cmd.in = pipefds[0];
> +		if (start_command(&cmd2))
> +			die("couldn't start tee to save a pack");

When you call start_command(), you must also call finish_command(). 
start_command() prints an error message for you; you don't have to do 
that (the start_command() in the context below is a bad example).

> +	} else
> +		cmd.in = demux.out;
>   	cmd.git_cmd = 1;
>   	if (start_command(&cmd))
>   		die("fetch-pack: unable to fork off %s", cmd_name);
> diff --git a/t/t5551-http-fetch-smart.sh b/t/t5551-http-fetch-smart.sh
> index 58207d8..bf4640d 100755
> --- a/t/t5551-http-fetch-smart.sh
> +++ b/t/t5551-http-fetch-smart.sh
> @@ -82,11 +82,23 @@ test_expect_success 'fetch changes via http' '
>   	test_cmp file clone/file
>   '
>
> +test_expect_success 'fetch changes via http and save pack' '
> +	echo content >>file &&
> +	git commit -a -m two &&
> +	git push public &&
> +	GIT_SAVE_FETCHED_PACK_TO=saved.pack &&
> +	export GIT_SAVE_FETCHED_PACK_TO &&
> +	(cd clone && git pull) &&

This can be written as

	(
		cd clone &&
		GIT_SAVE_FETCHED_PACK_TO=../saved.pack git pull
	) &&

without 'export'.

> +	git index-pack clone/saved.pack
> +'
> +
>   cat >exp <<EOF
>   GET  /smart/repo.git/info/refs?service=git-upload-pack HTTP/1.1 200
>   POST /smart/repo.git/git-upload-pack HTTP/1.1 200
>   GET  /smart/repo.git/info/refs?service=git-upload-pack HTTP/1.1 200
>   POST /smart/repo.git/git-upload-pack HTTP/1.1 200
> +GET  /smart/repo.git/info/refs?service=git-upload-pack HTTP/1.1 200
> +POST /smart/repo.git/git-upload-pack HTTP/1.1 200
>   EOF
>   test_expect_success 'used upload-pack service' '
>   	sed -e "
> diff --git a/t/t5601-clone.sh b/t/t5601-clone.sh
> index bfdaf75..73f9e1c 100755
> --- a/t/t5601-clone.sh
> +++ b/t/t5601-clone.sh
> @@ -40,6 +40,15 @@ test_expect_success C_LOCALE_OUTPUT 'output from clone' '
>   	test $(grep Clon output | wc -l) = 1
>   '
>
> +test_expect_success 'clone allows saving a pack' '
> +	rm -fr dst saved.pack &&
> +	GIT_SAVE_FETCHED_PACK_TO=saved.pack &&
> +	export GIT_SAVE_FETCHED_PACK_TO &&
> +	git clone -n "file://$(pwd)/src" dst >output 2>&1 &&

Same here.

> +	test -e saved.pack &&
> +	git index-pack saved.pack
> +'
> +
>   test_expect_success 'clone does not keep pack' '
>
>   	rm -fr dst &&
>

-- Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] fetch-pack: optionally save packs to disk
  2015-06-12  6:22   ` Johannes Sixt
@ 2015-06-12 15:07     ` Junio C Hamano
  2015-06-12 17:02       ` Augie Fackler
  2015-06-12 18:00       ` Jeff King
  2015-06-12 16:54     ` [PATCH v2] fetch-pack: optionally save packs to disk Augie Fackler
  1 sibling, 2 replies; 33+ messages in thread
From: Junio C Hamano @ 2015-06-12 15:07 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: Augie Fackler, git, Stefan Beller

Johannes Sixt <j6t@kdbg.org> writes:

> What is the problem with the current fetch-pack implementation? Does
> it remove a bogus packfile after download? Does it abort during
> download when it detects a broken packfile? Does --keep not do what
> you need?

Doesn't the incoming data still go through the fattening process,
though?  You will not be able to inspect the byte-for-byte identical
stream that came out of the server end whose packfile generation
logic is suspect.

For the purpose of debugging your own new server implementation, it
might be a better approach to capture the pack as it comes out at
the server end, instead of doing it at the fetch-pack end as it
comes in. But the approach to add this "dump" at the fetch-pack side
is that it gives us a tool to diagnose problems that come from
broken server (re)implementations by other people we cannot debug,
i.e. "you are spewing this corrupt pack against this request; here
is a dump we took to help you go fix your server".

> Instead of your approach (which forks off tee to dump a copy of the
> packfile), would it not be simpler to add an option --debug-pack
> (probably not the best name) that skips the cleanup step when a broken
> packfile is detected and prints the name of the downloaded packfile?

As long as we need to debug a thin pack that comes from the other
end, that approach is not sufficient, I am afraid.

I anticipated that you'd have problem with its use of "tee".  It
probably can do this internally with async interface, perhaps,
instead?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] fetch-pack: optionally save packs to disk
  2015-06-12  6:22   ` Johannes Sixt
  2015-06-12 15:07     ` Junio C Hamano
@ 2015-06-12 16:54     ` Augie Fackler
  1 sibling, 0 replies; 33+ messages in thread
From: Augie Fackler @ 2015-06-12 16:54 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: git, Stefan Beller, Junio C Hamano

On Fri, Jun 12, 2015 at 2:22 AM, Johannes Sixt <j6t@kdbg.org> wrote:
>
> Am 11.06.2015 um 20:59 schrieb Augie Fackler:
>>
>> When developing server software, it's often helpful to save a
>> potentially-bogus pack for later analysis. This makes that trivial,
>> instead of painful.
>
>
> When you develop server software, shouldn't you test drive the server via
the bare metal protocol anyway? That *is* painful, but unavoidable because
you must harden the server against any garbage that a potentially malicous
client could throw at it. Restricting yourself to a well-behaved client
such as fetch-pack is only half the deal.


We do that too, but sometimes I've encountered an edge case that's
trivially reproduced from an existing repo, and going through the work to
manually drive the server is a monumental pain in the butt, and all I
*really* need is to see the bytes sent from the server to the client. If it
weren't for SSL-everywhere, I'd probably just do this with wireshark, but
that's not the world I live in.

>
>
> That said, I do think that fetch-pack could learn a mode that makes it
easier to debug the normal behavior of a server (if such a mode is missing
currently).
>
> What is the problem with the current fetch-pack implementation? Does it
remove a bogus packfile after download? Does it abort during download when
it detects a broken packfile? Does --keep not do what you need?


fetch-pack doesn't store the pack anywhere - it's sending it to index-pack
(or unpack-objects) using --stdin, which means that the raw bytes from the
server currently are never materialized anywhere on disk. Having index-pack
do this is too late, because it's doing things like rewriting the pack
header in a potentially new format.

(Junio also covered this well, thanks!)

>
>
> Instead of your approach (which forks off tee to dump a copy of the
packfile), would it not be simpler to add an option --debug-pack (probably
not the best name) that skips the cleanup step when a broken packfile is
detected and prints the name of the downloaded packfile?
>
>
>> diff --git a/fetch-pack.c b/fetch-pack.c
>> index a912935..fe6ba58 100644
>> --- a/fetch-pack.c
>> +++ b/fetch-pack.c
>> @@ -684,7 +684,7 @@ static int get_pack(struct fetch_pack_args *args,
>>         const char *argv[22];
>>         char keep_arg[256];
>>         char hdr_arg[256];
>> -       const char **av, *cmd_name;
>> +       const char **av, *cmd_name, *savepath;
>>         int do_keep = args->keep_pack;
>>         struct child_process cmd = CHILD_PROCESS_INIT;
>>         int ret;
>> @@ -708,9 +708,8 @@ static int get_pack(struct fetch_pack_args *args,
>>         cmd.argv = argv;
>>         av = argv;
>>         *hdr_arg = 0;
>> +       struct pack_header header;
>>         if (!args->keep_pack && unpack_limit) {
>> -               struct pack_header header;
>> -
>>                 if (read_pack_header(demux.out, &header))
>>                         die("protocol error: bad pack header");
>>                 snprintf(hdr_arg, sizeof(hdr_arg),
>> @@ -762,7 +761,44 @@ static int get_pack(struct fetch_pack_args *args,
>>                 *av++ = "--strict";
>>         *av++ = NULL;
>>
>> -       cmd.in = demux.out;
>> +       savepath = getenv("GIT_SAVE_FETCHED_PACK_TO");
>> +       if (savepath) {
>> +               struct child_process cmd2 = CHILD_PROCESS_INIT;
>> +               const char *argv2[22];
>> +               int pipefds[2];
>> +               int e;
>> +               const char **av2;
>> +               cmd2.argv = argv2;
>> +               av2 = argv2;
>> +               *av2++ = "tee";
>> +               if (*hdr_arg) {
>> +                       /* hdr_arg being nonempty means we already read
the
>> +                        * pack header from demux, so we need to drop a
pack
>> +                        * header in place for tee to append to,
otherwise
>> +                        * we'll end up with a broken pack on disk.
>> +                        */
>
>
>                         /*
>                          * Write multi-line comments
>                          * like this (/* on its own line)
>                          */
>
>> +                       int fp;
>> +                       struct sha1file *s;
>> +                       fp = open(savepath, O_CREAT | O_TRUNC |
O_WRONLY, 0666);
>> +                       s = sha1fd_throughput(fp, savepath, NULL);
>> +                       sha1write(s, &header, sizeof(header));
>> +                       sha1flush(s);
>
>
> Are you abusing sha1write() and sha1flush() to write a byte sequence to a
file? Is write_in_full() not sufficient?


I didn't know about write_in_full. I'm very much *not* familiar with git's
codebase - I know the protocols and formats reasonably well, but have
needed only occasionally to look at the code. That works, thanks.

>
>
>
>> +                       close(fp);
>> +                       /* -a is supported by both GNU and BSD tee */
>> +                       *av2++ = "-a";
>> +               }
>> +               *av2++ = savepath;
>> +               *av2++ = NULL;
>> +               cmd2.in = demux.out;
>> +               e = pipe(pipefds);
>> +               if (e != 0)
>> +                       die("couldn't make pipe to save pack");
>
>
> start_command() can create the pipe for you. Just say cmd2.out = -1.
>
>> +               cmd2.out = pipefds[1];
>> +               cmd.in = pipefds[0];
>> +               if (start_command(&cmd2))
>> +                       die("couldn't start tee to save a pack");
>
>
> When you call start_command(), you must also call finish_command().
start_command() prints an error message for you; you don't have to do that
(the start_command() in the context below is a bad example).

I looked around, and there are nonzero exit paths from start_command()
which do not print an error and die, so this seems safer. It's also in line
with the vast majority of uses of start_command in the codebase, so I left
this as-is. If you've got something specific you'd like to see here
instead, do let me know (presumably I still need to check the error code
from start_command()...)

>
>
>
>> +       } else
>> +               cmd.in = demux.out;
>>         cmd.git_cmd = 1;
>>         if (start_command(&cmd))
>>                 die("fetch-pack: unable to fork off %s", cmd_name);

[snip some good comments about test cleanups, all addressed]

>
> -- Hannes
>

I'll wait to mail a v3 until at least I know what's going on with
start_command() and error checking - perhaps until I get consensus on the
use of tee vs something else to save the bytes from the server.

https://github.com/durin42/git/commit/save-pack has the current version of
the patch if you want to see where it stands now.

Thanks for the review!

Augie

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] fetch-pack: optionally save packs to disk
  2015-06-12 15:07     ` Junio C Hamano
@ 2015-06-12 17:02       ` Augie Fackler
  2015-06-12 18:00       ` Jeff King
  1 sibling, 0 replies; 33+ messages in thread
From: Augie Fackler @ 2015-06-12 17:02 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Sixt, git, Stefan Beller

On Fri, Jun 12, 2015 at 11:07 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> What is the problem with the current fetch-pack implementation? Does
>> it remove a bogus packfile after download? Does it abort during
>> download when it detects a broken packfile? Does --keep not do what
>> you need?
>
> Doesn't the incoming data still go through the fattening process,
> though?  You will not be able to inspect the byte-for-byte identical
> stream that came out of the server end whose packfile generation
> logic is suspect.
>
> For the purpose of debugging your own new server implementation, it
> might be a better approach to capture the pack as it comes out at
> the server end, instead of doing it at the fetch-pack end as it
> comes in. But the approach to add this "dump" at the fetch-pack side
> is that it gives us a tool to diagnose problems that come from
> broken server (re)implementations by other people we cannot debug,
> i.e. "you are spewing this corrupt pack against this request; here
> is a dump we took to help you go fix your server".

*nod* that's an important part of it. Also, in the small-pull case,
the pack data gets sent to unpack-objects anyway, so git is never
saving the packfile anywhere in that case (I think it's for a pull of
less than 100 objects, which characterizes most of my reduced test
cases for weirdness.)

>
>> Instead of your approach (which forks off tee to dump a copy of the
>> packfile), would it not be simpler to add an option --debug-pack
>> (probably not the best name) that skips the cleanup step when a broken
>> packfile is detected and prints the name of the downloaded packfile?
>
> As long as we need to debug a thin pack that comes from the other
> end, that approach is not sufficient, I am afraid.
>
> I anticipated that you'd have problem with its use of "tee".  It
> probably can do this internally with async interface, perhaps,
> instead?

I'd be happy to make such changes if that's the consensus and someone
can give me pointers. My C is admittedly pretty rusty from non-use,
and I'm not at all familiar with git's codebase, but I'll at least
try.

Thanks!

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] fetch-pack: optionally save packs to disk
  2015-06-12 15:07     ` Junio C Hamano
  2015-06-12 17:02       ` Augie Fackler
@ 2015-06-12 18:00       ` Jeff King
  2015-06-12 21:25         ` Jeff King
  1 sibling, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-12 18:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Sixt, Augie Fackler, git, Stefan Beller

On Fri, Jun 12, 2015 at 08:07:36AM -0700, Junio C Hamano wrote:

> Johannes Sixt <j6t@kdbg.org> writes:
> 
> > What is the problem with the current fetch-pack implementation? Does
> > it remove a bogus packfile after download? Does it abort during
> > download when it detects a broken packfile? Does --keep not do what
> > you need?
> 
> Doesn't the incoming data still go through the fattening process,
> though?  You will not be able to inspect the byte-for-byte identical
> stream that came out of the server end whose packfile generation
> logic is suspect.
> 
> For the purpose of debugging your own new server implementation, it
> might be a better approach to capture the pack as it comes out at
> the server end, instead of doing it at the fetch-pack end as it
> comes in. But the approach to add this "dump" at the fetch-pack side
> is that it gives us a tool to diagnose problems that come from
> broken server (re)implementations by other people we cannot debug,
> i.e. "you are spewing this corrupt pack against this request; here
> is a dump we took to help you go fix your server".

When I added GIT_TRACE_PACKET long ago, I had always intended to
follow-up with a GIT_TRACE_PACKFILE. The former stops tracing when we
get to the binary data, but I had intended the latter to store the pure
on-the-wire packfile transmission for later debugging to. I never got
around to it, but I think it is something like the patch below.

With:

  GIT_TRACE_PACKET=1 GIT_TRACE_PACK=/tmp/foo.pack git clone ...

this yields a usable pack in /tmp/foo.pack (note that it only kicks in
when packet-tracing is on at all; if we restructure the code a bit, we
can remove that limitation).

In theory it would also work when receiving a pack via push, but I
think we actually skip the pkt-line protocol there. We'd have to
manually check GIT_TRACE_PACK.

Also, as a bonus, it means we do not stop tracing completely when we
start to receive a sideband pack. The current GIT_TRACE_PACKET code
misses any sideband-2 messages that come after we start receiving the
pack.

diff --git a/pkt-line.c b/pkt-line.c
index 187a229..f82871a 100644
--- a/pkt-line.c
+++ b/pkt-line.c
@@ -4,20 +4,39 @@
 char packet_buffer[LARGE_PACKET_MAX];
 static const char *packet_trace_prefix = "git";
 static struct trace_key trace_packet = TRACE_KEY_INIT(PACKET);
+static struct trace_key trace_pack = TRACE_KEY_INIT(PACK);
 
 void packet_trace_identity(const char *prog)
 {
 	packet_trace_prefix = xstrdup(prog);
 }
 
+static int packet_trace_pack(const char *buf, unsigned int len, int sideband)
+{
+	if (!sideband) {
+		trace_verbatim(&trace_pack, buf, len);
+		return 1;
+	} else if (len && *buf == '\1') {
+		trace_verbatim(&trace_pack, buf + 1, len - 1);
+		return 1;
+	} else {
+		/* it's another non-pack sideband */
+		return 0;
+	}
+}
+
 static void packet_trace(const char *buf, unsigned int len, int write)
 {
 	int i;
 	struct strbuf out;
+	static int in_pack, sideband;
 
 	if (!trace_want(&trace_packet))
 		return;
 
+	if (in_pack && packet_trace_pack(buf, len, sideband))
+		return;
+
 	/* +32 is just a guess for header + quoting */
 	strbuf_init(&out, len+32);
 
@@ -27,7 +46,9 @@ static void packet_trace(const char *buf, unsigned int len, int write)
 	if ((len >= 4 && starts_with(buf, "PACK")) ||
 	    (len >= 5 && starts_with(buf+1, "PACK"))) {
 		strbuf_addstr(&out, "PACK ...");
-		trace_disable(&trace_packet);
+		in_pack = 1;
+		sideband = *buf == '\1';
+		packet_trace_pack(buf, len, sideband);
 	}
 	else {
 		/* XXX we should really handle printable utf8 */
diff --git a/trace.c b/trace.c
index 3c3bd8f..7393926 100644
--- a/trace.c
+++ b/trace.c
@@ -120,6 +120,13 @@ static int prepare_trace_line(const char *file, int line,
 	return 1;
 }
 
+void trace_verbatim(struct trace_key *key, const void *buf, unsigned len)
+{
+	if (!trace_want(key))
+		return;
+	write_or_whine_pipe(get_trace_fd(key), buf, len, err_msg);
+}
+
 static void print_trace_line(struct trace_key *key, struct strbuf *buf)
 {
 	strbuf_complete_line(buf);
diff --git a/trace.h b/trace.h
index ae6a332..179b249 100644
--- a/trace.h
+++ b/trace.h
@@ -18,6 +18,7 @@ extern int trace_want(struct trace_key *key);
 extern void trace_disable(struct trace_key *key);
 extern uint64_t getnanotime(void);
 extern void trace_command_performance(const char **argv);
+extern void trace_verbatim(struct trace_key *key, const void *buf, unsigned len);
 
 #ifndef HAVE_VARIADIC_MACROS
 

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v2] fetch-pack: optionally save packs to disk
  2015-06-12 18:00       ` Jeff King
@ 2015-06-12 21:25         ` Jeff King
  2015-06-12 21:28           ` [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing Jeff King
                             ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Jeff King @ 2015-06-12 21:25 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Sixt, Augie Fackler, git, Stefan Beller

On Fri, Jun 12, 2015 at 02:00:05PM -0400, Jeff King wrote:

> When I added GIT_TRACE_PACKET long ago, I had always intended to
> follow-up with a GIT_TRACE_PACKFILE. The former stops tracing when we
> get to the binary data, but I had intended the latter to store the pure
> on-the-wire packfile transmission for later debugging to. I never got
> around to it, but I think it is something like the patch below.

Here it is, a bit more cleaned up. I think this is a nice improvement,
and it does fix some minor issues in the existing GIT_TRACE_PACKET code.
But I don't think it will ever work for receive-pack, because we hand
off the socket directly to index-pack.

I can live with that. But another approach would be to make it easier to
capture the stdin of programs with GIT_TRACE, rather than just their
arguments. That makes it more generically usable (e.g., I have hacky
patches on our servers for capturing "pack-objects" input so I can
replay expensive fetch requests). As Augie noted, that's not a full
pack-file due to the --pack-header stuff. But given a full trace (which
has both the arguments and stdin), you could reconstruct the packfile,
which we could do as a post-processing step.

I dunno. I prefer introducing a nicely generic mechanism, but it would
probably be a little more irritating to use in practice.

I guess yet another way to do it would be to put the GIT_TRACE_PACK
intelligence into index-pack and unpack-objects (rather than
fetch-pack). Then it at least applies to both the pushing and fetching
sides.

Anyway, here are the patches I came up with:

  [1/3]: pkt-line: simplify starts_with checks in packet tracing
  [2/3]: pkt-line: tighten sideband PACK check when tracing
  [3/3]: pkt-line: support tracing verbatim pack contents

-Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing
  2015-06-12 21:25         ` Jeff King
@ 2015-06-12 21:28           ` Jeff King
  2015-06-12 21:35             ` Stefan Beller
  2015-06-12 21:28           ` [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing Jeff King
  2015-06-12 21:28           ` [PATCH 3/3] pkt-line: support tracing verbatim pack contents Jeff King
  2 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-12 21:28 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Sixt, Augie Fackler, git, Stefan Beller

We carefully check that our pkt buffer has enough characters
before seeing if it starts with "PACK". The intent is to
avoid reading random memory if we get a short buffer like
"PAC".

However, we know that the traced packets are always
NUL-terminated. They come from one of these sources:

  1. A string literal.

  2. `format_packet`, which uses a strbuf.

  3. `packet_read`, which defensively NUL-terminates what we
     read.

We can therefore drop the length checks, as we know we will
hit the trailing NUL if we have a short input.

Signed-off-by: Jeff King <peff@peff.net>
---
 pkt-line.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pkt-line.c b/pkt-line.c
index 187a229..0477d2e 100644
--- a/pkt-line.c
+++ b/pkt-line.c
@@ -24,8 +24,7 @@ static void packet_trace(const char *buf, unsigned int len, int write)
 	strbuf_addf(&out, "packet: %12s%c ",
 		    packet_trace_prefix, write ? '>' : '<');
 
-	if ((len >= 4 && starts_with(buf, "PACK")) ||
-	    (len >= 5 && starts_with(buf+1, "PACK"))) {
+	if (starts_with(buf, "PACK") || starts_with(buf + 1, "PACK")) {
 		strbuf_addstr(&out, "PACK ...");
 		trace_disable(&trace_packet);
 	}
-- 
2.4.2.752.geeb594a

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing
  2015-06-12 21:25         ` Jeff King
  2015-06-12 21:28           ` [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing Jeff King
@ 2015-06-12 21:28           ` Jeff King
  2015-06-12 21:39             ` Stefan Beller
  2015-06-12 21:28           ` [PATCH 3/3] pkt-line: support tracing verbatim pack contents Jeff King
  2 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-12 21:28 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Sixt, Augie Fackler, git, Stefan Beller

To find the start of the pack data, we accept the word PACK
at the beginning of any sideband channel, even though what
we really want is to find the pack data on channel 1. In
practice this doesn't matter, as sideband-2 messages tend to
start with "error:" or similar, but it is a good idea to be
explicit (especially as we add more code in this area, we
will rely on this assumption).

Signed-off-by: Jeff King <peff@peff.net>
---
 pkt-line.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkt-line.c b/pkt-line.c
index 0477d2e..e75af02 100644
--- a/pkt-line.c
+++ b/pkt-line.c
@@ -24,7 +24,7 @@ static void packet_trace(const char *buf, unsigned int len, int write)
 	strbuf_addf(&out, "packet: %12s%c ",
 		    packet_trace_prefix, write ? '>' : '<');
 
-	if (starts_with(buf, "PACK") || starts_with(buf + 1, "PACK")) {
+	if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
 		strbuf_addstr(&out, "PACK ...");
 		trace_disable(&trace_packet);
 	}
-- 
2.4.2.752.geeb594a

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-12 21:25         ` Jeff King
  2015-06-12 21:28           ` [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing Jeff King
  2015-06-12 21:28           ` [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing Jeff King
@ 2015-06-12 21:28           ` Jeff King
  2015-06-16 15:38             ` Augie Fackler
  2 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-12 21:28 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Johannes Sixt, Augie Fackler, git, Stefan Beller

When debugging the pack protocol, it is sometimes useful to
store the verbatim pack that we sent or received on the
wire. Looking at the on-disk result is often not helpful for
a few reasons:

  1. If the operation is a clone, we destroy the repo on
     failure, leaving nothing on disk.

  2. If the pack is small, we unpack it immediately, and the
     full pack never hits the disk.

  3. If we feed the pack to "index-pack --fix-thin", the
     resulting pack has the extra delta bases added to it.

We already have a GIT_TRACE_PACKET mechanism for tracing
packets. Let's extend it with GIT_TRACE_PACK to dump the
verbatim packfile.

There are a few other positive fallouts that come from
rearranging this code:

 - We currently disable the packet trace after seeing the
   PACK header, even though we may get human-readable lines
   on other sidebands; now we include them in the trace.

 - We currently try to print "PACK ..." in the trace to
   indicate that the packfile has started. But because we
   disable packet tracing, we never printed this line. We
   will now do so.

Signed-off-by: Jeff King <peff@peff.net>
---
 Documentation/git.txt | 13 +++++++++++-
 pkt-line.c            | 59 ++++++++++++++++++++++++++++++++++++++-------------
 t/t5601-clone.sh      |  7 ++++++
 trace.c               |  7 ++++++
 trace.h               |  1 +
 5 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/Documentation/git.txt b/Documentation/git.txt
index 45b64a7..8c44d14 100644
--- a/Documentation/git.txt
+++ b/Documentation/git.txt
@@ -1000,9 +1000,20 @@ Unsetting the variable, or setting it to empty, "0" or
 	Enables trace messages for all packets coming in or out of a
 	given program. This can help with debugging object negotiation
 	or other protocol issues. Tracing is turned off at a packet
-	starting with "PACK".
+	starting with "PACK" (but see 'GIT_TRACE_PACK' below).
 	See 'GIT_TRACE' for available trace output options.
 
+'GIT_TRACE_PACK'::
+	Enables tracing of packfiles sent or received by a
+	given program. Unlike other trace output, this trace is
+	verbatim: no headers, and no quoting of binary data. You almost
+	certainly want to direct into a file (e.g.,
+	`GIT_TRACE_PACK=/tmp/my.pack`) rather than displaying it on the
+	terminal or mixing it with other trace output.
++
+Note that this is currently only implemented for the client side
+of clones and fetches.
+
 'GIT_TRACE_PERFORMANCE'::
 	Enables performance related trace messages, e.g. total execution
 	time of each Git command.
diff --git a/pkt-line.c b/pkt-line.c
index e75af02..2e122c0 100644
--- a/pkt-line.c
+++ b/pkt-line.c
@@ -4,16 +4,51 @@
 char packet_buffer[LARGE_PACKET_MAX];
 static const char *packet_trace_prefix = "git";
 static struct trace_key trace_packet = TRACE_KEY_INIT(PACKET);
+static struct trace_key trace_pack = TRACE_KEY_INIT(PACK);
 
 void packet_trace_identity(const char *prog)
 {
 	packet_trace_prefix = xstrdup(prog);
 }
 
+static int packet_trace_pack(const char *buf, unsigned int len, int sideband)
+{
+	if (!sideband) {
+		trace_verbatim(&trace_pack, buf, len);
+		return 1;
+	} else if (len && *buf == '\1') {
+		trace_verbatim(&trace_pack, buf + 1, len - 1);
+		return 1;
+	} else {
+		/* it's another non-pack sideband */
+		return 0;
+	}
+}
+
 static void packet_trace(const char *buf, unsigned int len, int write)
 {
 	int i;
 	struct strbuf out;
+	static int in_pack, sideband;
+
+	if (!trace_want(&trace_packet) && !trace_want(&trace_pack))
+		return;
+
+	if (in_pack) {
+		if (packet_trace_pack(buf, len, sideband))
+			return;
+	} else if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
+		in_pack = 1;
+		sideband = *buf == '\1';
+		packet_trace_pack(buf, len, sideband);
+
+		/*
+		 * Make a note in the human-readable trace that the pack data
+		 * started.
+		 */
+		buf = "PACK ...";
+		len = strlen(buf);
+	}
 
 	if (!trace_want(&trace_packet))
 		return;
@@ -24,21 +59,15 @@ static void packet_trace(const char *buf, unsigned int len, int write)
 	strbuf_addf(&out, "packet: %12s%c ",
 		    packet_trace_prefix, write ? '>' : '<');
 
-	if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
-		strbuf_addstr(&out, "PACK ...");
-		trace_disable(&trace_packet);
-	}
-	else {
-		/* XXX we should really handle printable utf8 */
-		for (i = 0; i < len; i++) {
-			/* suppress newlines */
-			if (buf[i] == '\n')
-				continue;
-			if (buf[i] >= 0x20 && buf[i] <= 0x7e)
-				strbuf_addch(&out, buf[i]);
-			else
-				strbuf_addf(&out, "\\%o", buf[i]);
-		}
+	/* XXX we should really handle printable utf8 */
+	for (i = 0; i < len; i++) {
+		/* suppress newlines */
+		if (buf[i] == '\n')
+			continue;
+		if (buf[i] >= 0x20 && buf[i] <= 0x7e)
+			strbuf_addch(&out, buf[i]);
+		else
+			strbuf_addf(&out, "\\%o", buf[i]);
 	}
 
 	strbuf_addch(&out, '\n');
diff --git a/t/t5601-clone.sh b/t/t5601-clone.sh
index bfdaf75..795ece0 100755
--- a/t/t5601-clone.sh
+++ b/t/t5601-clone.sh
@@ -496,4 +496,11 @@ test_expect_success 'shallow clone locally' '
 	( cd ddsstt && git fsck )
 '
 
+test_expect_success 'GIT_TRACE_PACK produces a usable pack' '
+	rm -rf dst.git &&
+	GIT_TRACE_PACK=$PWD/tmp.pack git clone --no-local --bare src dst.git &&
+	git init --bare replay.git &&
+	git -C replay.git index-pack -v --stdin <tmp.pack
+'
+
 test_done
diff --git a/trace.c b/trace.c
index 3c3bd8f..7393926 100644
--- a/trace.c
+++ b/trace.c
@@ -120,6 +120,13 @@ static int prepare_trace_line(const char *file, int line,
 	return 1;
 }
 
+void trace_verbatim(struct trace_key *key, const void *buf, unsigned len)
+{
+	if (!trace_want(key))
+		return;
+	write_or_whine_pipe(get_trace_fd(key), buf, len, err_msg);
+}
+
 static void print_trace_line(struct trace_key *key, struct strbuf *buf)
 {
 	strbuf_complete_line(buf);
diff --git a/trace.h b/trace.h
index ae6a332..179b249 100644
--- a/trace.h
+++ b/trace.h
@@ -18,6 +18,7 @@ extern int trace_want(struct trace_key *key);
 extern void trace_disable(struct trace_key *key);
 extern uint64_t getnanotime(void);
 extern void trace_command_performance(const char **argv);
+extern void trace_verbatim(struct trace_key *key, const void *buf, unsigned len);
 
 #ifndef HAVE_VARIADIC_MACROS
 
-- 
2.4.2.752.geeb594a

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing
  2015-06-12 21:28           ` [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing Jeff King
@ 2015-06-12 21:35             ` Stefan Beller
  0 siblings, 0 replies; 33+ messages in thread
From: Stefan Beller @ 2015-06-12 21:35 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, Augie Fackler, git

On Fri, Jun 12, 2015 at 2:28 PM, Jeff King <peff@peff.net> wrote:
> We carefully check that our pkt buffer has enough characters
> before seeing if it starts with "PACK". The intent is to
> avoid reading random memory if we get a short buffer like
> "PAC".
>
> However, we know that the traced packets are always
> NUL-terminated. They come from one of these sources:
>
>   1. A string literal.
>
>   2. `format_packet`, which uses a strbuf.
>
>   3. `packet_read`, which defensively NUL-terminates what we
>      read.
>
> We can therefore drop the length checks, as we know we will
> hit the trailing NUL if we have a short input.
>
> Signed-off-by: Jeff King <peff@peff.net>
> ---
>  pkt-line.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/pkt-line.c b/pkt-line.c
> index 187a229..0477d2e 100644
> --- a/pkt-line.c
> +++ b/pkt-line.c
> @@ -24,8 +24,7 @@ static void packet_trace(const char *buf, unsigned int len, int write)
>         strbuf_addf(&out, "packet: %12s%c ",
>                     packet_trace_prefix, write ? '>' : '<');
>
> -       if ((len >= 4 && starts_with(buf, "PACK")) ||
> -           (len >= 5 && starts_with(buf+1, "PACK"))) {

So I wondered why there is a possible extra character in front of PACK.
So I run the blame machinery and ended up with bbc30f9963 (add
packet tracing debug code, 2011-02-24), which was also authored
by you. Where does the extra char come from?

Would it be better for readability to write it as

    int offset = 0;
    if (*buf == CHARACTER_STEFAN_IS_WONDERING_ABOUT)
        /* ignore char foo because bar */
        offset++;
    if (starts_with(buf+offset, "PACK") {
        ...



> +       if (starts_with(buf, "PACK") || starts_with(buf + 1, "PACK")) {
>                 strbuf_addstr(&out, "PACK ...");
>                 trace_disable(&trace_packet);
>         }
> --
> 2.4.2.752.geeb594a
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing
  2015-06-12 21:28           ` [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing Jeff King
@ 2015-06-12 21:39             ` Stefan Beller
  2015-06-12 21:41               ` Jeff King
  0 siblings, 1 reply; 33+ messages in thread
From: Stefan Beller @ 2015-06-12 21:39 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, Augie Fackler, git

On Fri, Jun 12, 2015 at 2:28 PM, Jeff King <peff@peff.net> wrote:
> To find the start of the pack data, we accept the word PACK
> at the beginning of any sideband channel, even though what
> we really want is to find the pack data on channel 1. In
> practice this doesn't matter, as sideband-2 messages tend to
> start with "error:" or similar, but it is a good idea to be
> explicit (especially as we add more code in this area, we
> will rely on this assumption).
>
> Signed-off-by: Jeff King <peff@peff.net>
> ---
>  pkt-line.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/pkt-line.c b/pkt-line.c
> index 0477d2e..e75af02 100644
> --- a/pkt-line.c
> +++ b/pkt-line.c
> @@ -24,7 +24,7 @@ static void packet_trace(const char *buf, unsigned int len, int write)
>         strbuf_addf(&out, "packet: %12s%c ",
>                     packet_trace_prefix, write ? '>' : '<');
>
> -       if (starts_with(buf, "PACK") || starts_with(buf + 1, "PACK")) {
> +       if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {

This answers the question on the previous patch actually, maybe the
code could be improved to

    if (is_sidechannel(out, ...)
        out++;
    if (starts_with(buf, "PACK") {
        ...


>                 strbuf_addstr(&out, "PACK ...");
>                 trace_disable(&trace_packet);
>         }
> --
> 2.4.2.752.geeb594a
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing
  2015-06-12 21:39             ` Stefan Beller
@ 2015-06-12 21:41               ` Jeff King
  2015-06-12 21:43                 ` Stefan Beller
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-12 21:41 UTC (permalink / raw)
  To: Stefan Beller; +Cc: Junio C Hamano, Johannes Sixt, Augie Fackler, git

On Fri, Jun 12, 2015 at 02:39:01PM -0700, Stefan Beller wrote:

> > -       if (starts_with(buf, "PACK") || starts_with(buf + 1, "PACK")) {
> > +       if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
> 
> This answers the question on the previous patch actually, maybe the
> code could be improved to
> 
>     if (is_sidechannel(out, ...)
>         out++;
>     if (starts_with(buf, "PACK") {
>         ...

If it's not a PACK, then we don't want to skip past the side-channel
character (we show it as part of the trace).

Hopefully the end result after patch 3 reads well, as it sets an
explicit "sideband" flag.

-Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing
  2015-06-12 21:41               ` Jeff King
@ 2015-06-12 21:43                 ` Stefan Beller
  0 siblings, 0 replies; 33+ messages in thread
From: Stefan Beller @ 2015-06-12 21:43 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, Augie Fackler, git

On Fri, Jun 12, 2015 at 2:41 PM, Jeff King <peff@peff.net> wrote:
> On Fri, Jun 12, 2015 at 02:39:01PM -0700, Stefan Beller wrote:
>
>> > -       if (starts_with(buf, "PACK") || starts_with(buf + 1, "PACK")) {
>> > +       if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
>>
>> This answers the question on the previous patch actually, maybe the
>> code could be improved to
>>
>>     if (is_sidechannel(out, ...)
>>         out++;
>>     if (starts_with(buf, "PACK") {
>>         ...
>
> If it's not a PACK, then we don't want to skip past the side-channel
> character (we show it as part of the trace).
>
> Hopefully the end result after patch 3 reads well, as it sets an
> explicit "sideband" flag.

Yeah, I just looked at that, and realized I should not worry about
these first two patches
as the line is deleted anyway. :( Writing faster than I can think.

>
> -Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-12 21:28           ` [PATCH 3/3] pkt-line: support tracing verbatim pack contents Jeff King
@ 2015-06-16 15:38             ` Augie Fackler
  2015-06-16 16:39               ` Junio C Hamano
  2015-06-16 17:10               ` Jeff King
  0 siblings, 2 replies; 33+ messages in thread
From: Augie Fackler @ 2015-06-16 15:38 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Fri, Jun 12, 2015 at 5:28 PM, Jeff King <peff@peff.net> wrote:
> When debugging the pack protocol, it is sometimes useful to
> store the verbatim pack that we sent or received on the
> wire. Looking at the on-disk result is often not helpful for
> a few reasons:
>
>   1. If the operation is a clone, we destroy the repo on
>      failure, leaving nothing on disk.
>
>   2. If the pack is small, we unpack it immediately, and the
>      full pack never hits the disk.
>
>   3. If we feed the pack to "index-pack --fix-thin", the
>      resulting pack has the extra delta bases added to it.
>
> We already have a GIT_TRACE_PACKET mechanism for tracing
> packets. Let's extend it with GIT_TRACE_PACK to dump the
> verbatim packfile.

FWIW, this also works for me - I have no preference between my patches
and Jeff's. I suspect yours are much better given that you have a clue
about git internals ;).

One bit of feedback is that it might be worth mentioning (though I
don't feel strongly) that GIT_TRACE_PACK works with or without
GIT_TRACE_PACKET - that wasn't immediately obvious to me, but it makes
sense once I read the code.

Thanks!

>
> There are a few other positive fallouts that come from
> rearranging this code:
>
>  - We currently disable the packet trace after seeing the
>    PACK header, even though we may get human-readable lines
>    on other sidebands; now we include them in the trace.
>
>  - We currently try to print "PACK ..." in the trace to
>    indicate that the packfile has started. But because we
>    disable packet tracing, we never printed this line. We
>    will now do so.
>
> Signed-off-by: Jeff King <peff@peff.net>
> ---
>  Documentation/git.txt | 13 +++++++++++-
>  pkt-line.c            | 59 ++++++++++++++++++++++++++++++++++++++-------------
>  t/t5601-clone.sh      |  7 ++++++
>  trace.c               |  7 ++++++
>  trace.h               |  1 +
>  5 files changed, 71 insertions(+), 16 deletions(-)
>
> diff --git a/Documentation/git.txt b/Documentation/git.txt
> index 45b64a7..8c44d14 100644
> --- a/Documentation/git.txt
> +++ b/Documentation/git.txt
> @@ -1000,9 +1000,20 @@ Unsetting the variable, or setting it to empty, "0" or
>         Enables trace messages for all packets coming in or out of a
>         given program. This can help with debugging object negotiation
>         or other protocol issues. Tracing is turned off at a packet
> -       starting with "PACK".
> +       starting with "PACK" (but see 'GIT_TRACE_PACK' below).
>         See 'GIT_TRACE' for available trace output options.
>
> +'GIT_TRACE_PACK'::
> +       Enables tracing of packfiles sent or received by a
> +       given program. Unlike other trace output, this trace is
> +       verbatim: no headers, and no quoting of binary data. You almost
> +       certainly want to direct into a file (e.g.,
> +       `GIT_TRACE_PACK=/tmp/my.pack`) rather than displaying it on the
> +       terminal or mixing it with other trace output.
> ++
> +Note that this is currently only implemented for the client side
> +of clones and fetches.
> +
>  'GIT_TRACE_PERFORMANCE'::
>         Enables performance related trace messages, e.g. total execution
>         time of each Git command.
> diff --git a/pkt-line.c b/pkt-line.c
> index e75af02..2e122c0 100644
> --- a/pkt-line.c
> +++ b/pkt-line.c
> @@ -4,16 +4,51 @@
>  char packet_buffer[LARGE_PACKET_MAX];
>  static const char *packet_trace_prefix = "git";
>  static struct trace_key trace_packet = TRACE_KEY_INIT(PACKET);
> +static struct trace_key trace_pack = TRACE_KEY_INIT(PACK);
>
>  void packet_trace_identity(const char *prog)
>  {
>         packet_trace_prefix = xstrdup(prog);
>  }
>
> +static int packet_trace_pack(const char *buf, unsigned int len, int sideband)
> +{
> +       if (!sideband) {
> +               trace_verbatim(&trace_pack, buf, len);
> +               return 1;
> +       } else if (len && *buf == '\1') {
> +               trace_verbatim(&trace_pack, buf + 1, len - 1);
> +               return 1;
> +       } else {
> +               /* it's another non-pack sideband */
> +               return 0;
> +       }
> +}
> +
>  static void packet_trace(const char *buf, unsigned int len, int write)
>  {
>         int i;
>         struct strbuf out;
> +       static int in_pack, sideband;
> +
> +       if (!trace_want(&trace_packet) && !trace_want(&trace_pack))
> +               return;
> +
> +       if (in_pack) {
> +               if (packet_trace_pack(buf, len, sideband))
> +                       return;
> +       } else if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
> +               in_pack = 1;
> +               sideband = *buf == '\1';
> +               packet_trace_pack(buf, len, sideband);
> +
> +               /*
> +                * Make a note in the human-readable trace that the pack data
> +                * started.
> +                */
> +               buf = "PACK ...";
> +               len = strlen(buf);
> +       }
>
>         if (!trace_want(&trace_packet))
>                 return;
> @@ -24,21 +59,15 @@ static void packet_trace(const char *buf, unsigned int len, int write)
>         strbuf_addf(&out, "packet: %12s%c ",
>                     packet_trace_prefix, write ? '>' : '<');
>
> -       if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
> -               strbuf_addstr(&out, "PACK ...");
> -               trace_disable(&trace_packet);
> -       }
> -       else {
> -               /* XXX we should really handle printable utf8 */
> -               for (i = 0; i < len; i++) {
> -                       /* suppress newlines */
> -                       if (buf[i] == '\n')
> -                               continue;
> -                       if (buf[i] >= 0x20 && buf[i] <= 0x7e)
> -                               strbuf_addch(&out, buf[i]);
> -                       else
> -                               strbuf_addf(&out, "\\%o", buf[i]);
> -               }
> +       /* XXX we should really handle printable utf8 */
> +       for (i = 0; i < len; i++) {
> +               /* suppress newlines */
> +               if (buf[i] == '\n')
> +                       continue;
> +               if (buf[i] >= 0x20 && buf[i] <= 0x7e)
> +                       strbuf_addch(&out, buf[i]);
> +               else
> +                       strbuf_addf(&out, "\\%o", buf[i]);
>         }
>
>         strbuf_addch(&out, '\n');
> diff --git a/t/t5601-clone.sh b/t/t5601-clone.sh
> index bfdaf75..795ece0 100755
> --- a/t/t5601-clone.sh
> +++ b/t/t5601-clone.sh
> @@ -496,4 +496,11 @@ test_expect_success 'shallow clone locally' '
>         ( cd ddsstt && git fsck )
>  '
>
> +test_expect_success 'GIT_TRACE_PACK produces a usable pack' '
> +       rm -rf dst.git &&
> +       GIT_TRACE_PACK=$PWD/tmp.pack git clone --no-local --bare src dst.git &&
> +       git init --bare replay.git &&
> +       git -C replay.git index-pack -v --stdin <tmp.pack
> +'
> +
>  test_done
> diff --git a/trace.c b/trace.c
> index 3c3bd8f..7393926 100644
> --- a/trace.c
> +++ b/trace.c
> @@ -120,6 +120,13 @@ static int prepare_trace_line(const char *file, int line,
>         return 1;
>  }
>
> +void trace_verbatim(struct trace_key *key, const void *buf, unsigned len)
> +{
> +       if (!trace_want(key))
> +               return;
> +       write_or_whine_pipe(get_trace_fd(key), buf, len, err_msg);
> +}
> +
>  static void print_trace_line(struct trace_key *key, struct strbuf *buf)
>  {
>         strbuf_complete_line(buf);
> diff --git a/trace.h b/trace.h
> index ae6a332..179b249 100644
> --- a/trace.h
> +++ b/trace.h
> @@ -18,6 +18,7 @@ extern int trace_want(struct trace_key *key);
>  extern void trace_disable(struct trace_key *key);
>  extern uint64_t getnanotime(void);
>  extern void trace_command_performance(const char **argv);
> +extern void trace_verbatim(struct trace_key *key, const void *buf, unsigned len);
>
>  #ifndef HAVE_VARIADIC_MACROS
>
> --
> 2.4.2.752.geeb594a

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 15:38             ` Augie Fackler
@ 2015-06-16 16:39               ` Junio C Hamano
  2015-06-16 16:43                 ` Jeff King
  2015-06-16 17:10               ` Jeff King
  1 sibling, 1 reply; 33+ messages in thread
From: Junio C Hamano @ 2015-06-16 16:39 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Jeff King, Johannes Sixt, git, Stefan Beller

Augie Fackler <augie@google.com> writes:

> On Fri, Jun 12, 2015 at 5:28 PM, Jeff King <peff@peff.net> wrote:
>> When debugging the pack protocol, it is sometimes useful to
>> store the verbatim pack that we sent or received on the
>> wire. Looking at the on-disk result is often not helpful for
>> a few reasons:
>>
>>   1. If the operation is a clone, we destroy the repo on
>>      failure, leaving nothing on disk.
>>
>>   2. If the pack is small, we unpack it immediately, and the
>>      full pack never hits the disk.
>>
>>   3. If we feed the pack to "index-pack --fix-thin", the
>>      resulting pack has the extra delta bases added to it.
>>
>> We already have a GIT_TRACE_PACKET mechanism for tracing
>> packets. Let's extend it with GIT_TRACE_PACK to dump the
>> verbatim packfile.
>
> FWIW, this also works for me - I have no preference between my patches
> and Jeff's. I suspect yours are much better given that you have a clue
> about git internals ;).
>
> One bit of feedback is that it might be worth mentioning (though I
> don't feel strongly) that GIT_TRACE_PACK works with or without
> GIT_TRACE_PACKET - that wasn't immediately obvious to me, but it makes
> sense once I read the code.
>
> Thanks!

Thanks, both.  I think this series makes sense.

As to the documentation, I have a feeling that, unless the reader
and/or the user intimately knows that TRACE_PACK is implemented by
hooking into the same mechanism that TRACE_PACKET needs to, s/he
would not even wonder if TRACE_PACKET needs to be enabled when
asking for TRACE_PACK.  Yes, one is a proper substring of the other,
but the similarity between the two stops there.  While I do not
think it would hurt very much to mention that they are independent,
I have a slight suspicion that it might make it more likely to get
user confused.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 16:39               ` Junio C Hamano
@ 2015-06-16 16:43                 ` Jeff King
  2015-06-16 16:52                   ` Augie Fackler
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-16 16:43 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Augie Fackler, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 09:39:19AM -0700, Junio C Hamano wrote:

> As to the documentation, I have a feeling that, unless the reader
> and/or the user intimately knows that TRACE_PACK is implemented by
> hooking into the same mechanism that TRACE_PACKET needs to, s/he
> would not even wonder if TRACE_PACKET needs to be enabled when
> asking for TRACE_PACK.  Yes, one is a proper substring of the other,
> but the similarity between the two stops there.  While I do not
> think it would hurt very much to mention that they are independent,
> I have a slight suspicion that it might make it more likely to get
> user confused.

Yes, I was just re-reading the documentation based on Augie's comment,
and it seems pretty clear to me. Of course I wrote it, so that is not
saying much. Augie, I'd be happy to hear a proposed wording change if
you have one.

I do kind of hate the name TRACE_PACK for two reasons:

  - it _is_ so close to TRACE_PACKET; maybe TRACE_PACKFILE would be
    better

  - it does not indicate that it is about on-the-wire packs. I.e., it
    has nothing to do with "git repack". But I could not think of a good
    succinct name to indicate that.

-Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 16:43                 ` Jeff King
@ 2015-06-16 16:52                   ` Augie Fackler
  2015-06-16 17:23                     ` Jeff King
  0 siblings, 1 reply; 33+ messages in thread
From: Augie Fackler @ 2015-06-16 16:52 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 12:43 PM, Jeff King <peff@peff.net> wrote:
> On Tue, Jun 16, 2015 at 09:39:19AM -0700, Junio C Hamano wrote:
>
>> As to the documentation, I have a feeling that, unless the reader
>> and/or the user intimately knows that TRACE_PACK is implemented by
>> hooking into the same mechanism that TRACE_PACKET needs to, s/he
>> would not even wonder if TRACE_PACKET needs to be enabled when
>> asking for TRACE_PACK.  Yes, one is a proper substring of the other,
>> but the similarity between the two stops there.  While I do not
>> think it would hurt very much to mention that they are independent,
>> I have a slight suspicion that it might make it more likely to get
>> user confused.
>
> Yes, I was just re-reading the documentation based on Augie's comment,
> and it seems pretty clear to me. Of course I wrote it, so that is not
> saying much. Augie, I'd be happy to hear a proposed wording change if
> you have one.

Yeah, I don't have one - I may have been making assumptions because I
knew something of the background on your implementation. i'd say let
it live and if people provide feedback later figure out a reword
(writing for humans is not a strong suit of mine, alas.)

>
> I do kind of hate the name TRACE_PACK for two reasons:
>
>   - it _is_ so close to TRACE_PACKET; maybe TRACE_PACKFILE would be
>     better

+1 on TRACE_PACKFILE - also makes it harder to accidentally leave off
the ET and get binary spew when one didn't expect that.

AF

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 15:38             ` Augie Fackler
  2015-06-16 16:39               ` Junio C Hamano
@ 2015-06-16 17:10               ` Jeff King
  2015-06-16 17:14                 ` Augie Fackler
  1 sibling, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-16 17:10 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 11:38:41AM -0400, Augie Fackler wrote:

> > We already have a GIT_TRACE_PACKET mechanism for tracing
> > packets. Let's extend it with GIT_TRACE_PACK to dump the
> > verbatim packfile.
> 
> FWIW, this also works for me - I have no preference between my patches
> and Jeff's. I suspect yours are much better given that you have a clue
> about git internals ;).

I like mine better than yours, if only because it hooks into the
existing tracing mechanism. But I am sad that neither of our proposals
works for tracing pushed packs (something that is useful for the
opposite situation: you have a sane git server, and some unknown
misbehaving _client_ is sending you bogus packs).

Here's a rough cut at the "trace stdin" idea I mentioned earlier (which
is essentially an internal "tee"). You can collect the incoming pack
like:

  GIT_TRACE=1 \
  GIT_TRACE_STDIN=/tmp/foo.pack \
  GIT_TRACE_STDIN_FOR=index-pack \
  git fetch ...

The "STDIN_FOR" hack is there because otherwise you get the stdin for
multiple processes intermingled. I think a cleaner way to do it would be
to allow something like:

  GIT_TRACE_STDIN=/tmp/stdin.%p

and replace "%p" with the PID of the tracing process.

I like this approach because it works everywhere (for pushed packs, but
also for any other git commands you may want to debug).  The downside
versus the other patches is that to replay the index-pack command, you
need to collect both its stdin and its arguments (which you can get from
the GIT_TRACE output). On the other hand, it gives you a more realistic
replay of what happened (e.g., if there is a bug triggered by the
--pack-header code).

Here is the patch:

---
diff --git a/git.c b/git.c
index 44374b1..d58866e 100644
--- a/git.c
+++ b/git.c
@@ -616,6 +616,80 @@ static void restore_sigpipe_to_default(void)
 	signal(SIGPIPE, SIG_DFL);
 }
 
+static int copy_stdin(int in, int out, void *data)
+{
+	struct trace_key *key = data;
+	while (1) {
+		char buf[8192];
+		ssize_t len = xread(in, buf, sizeof(buf));
+		if (!len)
+			break;
+		if (len < 0) {
+			warning("error reading stdin trace: %s",
+				strerror(errno));
+			break;
+		}
+
+		trace_verbatim(key, buf, len);
+		if (write_in_full(out, buf, len) < 0) {
+			warning("error writing stdin trace: %s",
+				strerror(errno));
+			break;
+		}
+	}
+	close(in);
+	close(out);
+	return 0;
+}
+
+static int trace_stdin_for(const char *cmd)
+{
+	int ret = 1;
+	const char *x = getenv("GIT_TRACE_STDIN_FOR");
+
+	if (x) {
+		struct string_list items = STRING_LIST_INIT_DUP;
+		string_list_split(&items, x, ':', -1);
+		ret = unsorted_string_list_has_string(&items, cmd);
+		string_list_clear(&items, 0);
+	}
+
+	return ret;
+}
+
+static void trace_stdin(const char *cmd)
+{
+	static struct trace_key key = TRACE_KEY_INIT(STDIN);
+	static struct async async;
+
+	if (!trace_stdin_for(cmd) || !trace_want(&key))
+		return;
+
+	memset(&async, 0, sizeof(async));
+	async.proc = copy_stdin;
+	async.data = &key;
+	async.in = dup(0);
+	async.out = -1;
+
+	if (async.in < 0 || start_async(&async) < 0) {
+		warning("unable to trace stdin: %s", strerror(errno));
+		return ;
+	}
+
+	/*
+	 * At this point we've handed stdin off to the async process,
+	 * so there we are past the point of no return.
+	 */
+	if (dup2(async.out, 0))
+		die_errno("unable to redirect stdin from async process");
+	close(async.out);
+
+	/*
+	 * leak async; we would know to finish_async() only when we are
+	 * exiting, and there is no point then
+	 */
+}
+
 int main(int argc, char **av)
 {
 	const char **argv = (const char **) av;
@@ -674,6 +748,8 @@ int main(int argc, char **av)
 	}
 	cmd = argv[0];
 
+	trace_stdin(cmd);
+
 	/*
 	 * We use PATH to find git commands, but we prepend some higher
 	 * precedence paths: the "--exec-path" option, the GIT_EXEC_PATH

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 17:10               ` Jeff King
@ 2015-06-16 17:14                 ` Augie Fackler
  2015-06-16 17:18                   ` Jeff King
  0 siblings, 1 reply; 33+ messages in thread
From: Augie Fackler @ 2015-06-16 17:14 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 1:10 PM, Jeff King <peff@peff.net> wrote:
> On Tue, Jun 16, 2015 at 11:38:41AM -0400, Augie Fackler wrote:
>
>> > We already have a GIT_TRACE_PACKET mechanism for tracing
>> > packets. Let's extend it with GIT_TRACE_PACK to dump the
>> > verbatim packfile.
>>
>> FWIW, this also works for me - I have no preference between my patches
>> and Jeff's. I suspect yours are much better given that you have a clue
>> about git internals ;).
>
> I like mine better than yours, if only because it hooks into the
> existing tracing mechanism. But I am sad that neither of our proposals
> works for tracing pushed packs (something that is useful for the
> opposite situation: you have a sane git server, and some unknown
> misbehaving _client_ is sending you bogus packs).

Yeah, not having it for the push side is a slight bummer, but in
general I haven't had problems debugging git clients pushing bogus
data in the same way that I've had problems with weirdness in new
server features.

>
> Here's a rough cut at the "trace stdin" idea I mentioned earlier (which
> is essentially an internal "tee"). You can collect the incoming pack
> like:

Neat, but not sure I like the extra overhead of having to grab the
full trace and then reconstruct some arguments to be able to diagnose
the pack. Having the verbatim pack just land on disk is really handy,
because then any existing tools one has cooked up (my team has a few
weird one-off ones by now) just work without extra fussing or looking
up steps to reconstruct the whole file.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 17:14                 ` Augie Fackler
@ 2015-06-16 17:18                   ` Jeff King
  2015-06-16 17:23                     ` Augie Fackler
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-16 17:18 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 01:14:03PM -0400, Augie Fackler wrote:

> Yeah, not having it for the push side is a slight bummer, but in
> general I haven't had problems debugging git clients pushing bogus
> data in the same way that I've had problems with weirdness in new
> server features.

Being in charge of a large git server farm, I think I have the opposite
problem. :)

> > Here's a rough cut at the "trace stdin" idea I mentioned earlier (which
> > is essentially an internal "tee"). You can collect the incoming pack
> > like:
> 
> Neat, but not sure I like the extra overhead of having to grab the
> full trace and then reconstruct some arguments to be able to diagnose
> the pack. Having the verbatim pack just land on disk is really handy,
> because then any existing tools one has cooked up (my team has a few
> weird one-off ones by now) just work without extra fussing or looking
> up steps to reconstruct the whole file.

I guess there is really room for both. Just because you _can_ accomplish
the same thing with both does not mean we cannot have two ways to do it
(an easy way, and a harder, more flexible way).

-Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 16:52                   ` Augie Fackler
@ 2015-06-16 17:23                     ` Jeff King
  0 siblings, 0 replies; 33+ messages in thread
From: Jeff King @ 2015-06-16 17:23 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 12:52:46PM -0400, Augie Fackler wrote:

> Yeah, I don't have one - I may have been making assumptions because I
> knew something of the background on your implementation. i'd say let
> it live and if people provide feedback later figure out a reword
> (writing for humans is not a strong suit of mine, alas.)

OK, let's leave it as-is, then.

> +1 on TRACE_PACKFILE - also makes it harder to accidentally leave off
> the ET and get binary spew when one didn't expect that.

Here's a re-roll of 3/3, using TRACE_PACKFILE. I think the documentation
is enough to allay my other concern (that the name does not imply it is
about the network protocol).

-- >8 --
Subject: pkt-line: support tracing verbatim pack contents

When debugging the pack protocol, it is sometimes useful to
store the verbatim pack that we sent or received on the
wire. Looking at the on-disk result is often not helpful for
a few reasons:

  1. If the operation is a clone, we destroy the repo on
     failure, leaving nothing on disk.

  2. If the pack is small, we unpack it immediately, and the
     full pack never hits the disk.

  3. If we feed the pack to "index-pack --fix-thin", the
     resulting pack has the extra delta bases added to it.

We already have a GIT_TRACE_PACKET mechanism for tracing
packets. Let's extend it with GIT_TRACE_PACKFILE to dump the
verbatim packfile.

There are a few other positive fallouts that come from
rearranging this code:

 - We currently disable the packet trace after seeing the
   PACK header, even though we may get human-readable lines
   on other sidebands; now we include them in the trace.

 - We currently try to print "PACK ..." in the trace to
   indicate that the packfile has started. But because we
   disable packet tracing, we never printed this line. We
   will now do so.

Signed-off-by: Jeff King <peff@peff.net>
---
 Documentation/git.txt | 13 +++++++++++-
 pkt-line.c            | 59 ++++++++++++++++++++++++++++++++++++++-------------
 t/t5601-clone.sh      |  7 ++++++
 trace.c               |  7 ++++++
 trace.h               |  1 +
 5 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/Documentation/git.txt b/Documentation/git.txt
index 45b64a7..3453669 100644
--- a/Documentation/git.txt
+++ b/Documentation/git.txt
@@ -1000,9 +1000,20 @@ Unsetting the variable, or setting it to empty, "0" or
 	Enables trace messages for all packets coming in or out of a
 	given program. This can help with debugging object negotiation
 	or other protocol issues. Tracing is turned off at a packet
-	starting with "PACK".
+	starting with "PACK" (but see 'GIT_TRACE_PACKFILE' below).
 	See 'GIT_TRACE' for available trace output options.
 
+'GIT_TRACE_PACKFILE'::
+	Enables tracing of packfiles sent or received by a
+	given program. Unlike other trace output, this trace is
+	verbatim: no headers, and no quoting of binary data. You almost
+	certainly want to direct into a file (e.g.,
+	`GIT_TRACE_PACKFILE=/tmp/my.pack`) rather than displaying it on
+	the terminal or mixing it with other trace output.
++
+Note that this is currently only implemented for the client side
+of clones and fetches.
+
 'GIT_TRACE_PERFORMANCE'::
 	Enables performance related trace messages, e.g. total execution
 	time of each Git command.
diff --git a/pkt-line.c b/pkt-line.c
index e75af02..08a1427 100644
--- a/pkt-line.c
+++ b/pkt-line.c
@@ -4,16 +4,51 @@
 char packet_buffer[LARGE_PACKET_MAX];
 static const char *packet_trace_prefix = "git";
 static struct trace_key trace_packet = TRACE_KEY_INIT(PACKET);
+static struct trace_key trace_pack = TRACE_KEY_INIT(PACKFILE);
 
 void packet_trace_identity(const char *prog)
 {
 	packet_trace_prefix = xstrdup(prog);
 }
 
+static int packet_trace_pack(const char *buf, unsigned int len, int sideband)
+{
+	if (!sideband) {
+		trace_verbatim(&trace_pack, buf, len);
+		return 1;
+	} else if (len && *buf == '\1') {
+		trace_verbatim(&trace_pack, buf + 1, len - 1);
+		return 1;
+	} else {
+		/* it's another non-pack sideband */
+		return 0;
+	}
+}
+
 static void packet_trace(const char *buf, unsigned int len, int write)
 {
 	int i;
 	struct strbuf out;
+	static int in_pack, sideband;
+
+	if (!trace_want(&trace_packet) && !trace_want(&trace_pack))
+		return;
+
+	if (in_pack) {
+		if (packet_trace_pack(buf, len, sideband))
+			return;
+	} else if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
+		in_pack = 1;
+		sideband = *buf == '\1';
+		packet_trace_pack(buf, len, sideband);
+
+		/*
+		 * Make a note in the human-readable trace that the pack data
+		 * started.
+		 */
+		buf = "PACK ...";
+		len = strlen(buf);
+	}
 
 	if (!trace_want(&trace_packet))
 		return;
@@ -24,21 +59,15 @@ static void packet_trace(const char *buf, unsigned int len, int write)
 	strbuf_addf(&out, "packet: %12s%c ",
 		    packet_trace_prefix, write ? '>' : '<');
 
-	if (starts_with(buf, "PACK") || starts_with(buf, "\1PACK")) {
-		strbuf_addstr(&out, "PACK ...");
-		trace_disable(&trace_packet);
-	}
-	else {
-		/* XXX we should really handle printable utf8 */
-		for (i = 0; i < len; i++) {
-			/* suppress newlines */
-			if (buf[i] == '\n')
-				continue;
-			if (buf[i] >= 0x20 && buf[i] <= 0x7e)
-				strbuf_addch(&out, buf[i]);
-			else
-				strbuf_addf(&out, "\\%o", buf[i]);
-		}
+	/* XXX we should really handle printable utf8 */
+	for (i = 0; i < len; i++) {
+		/* suppress newlines */
+		if (buf[i] == '\n')
+			continue;
+		if (buf[i] >= 0x20 && buf[i] <= 0x7e)
+			strbuf_addch(&out, buf[i]);
+		else
+			strbuf_addf(&out, "\\%o", buf[i]);
 	}
 
 	strbuf_addch(&out, '\n');
diff --git a/t/t5601-clone.sh b/t/t5601-clone.sh
index bfdaf75..9b34f3c 100755
--- a/t/t5601-clone.sh
+++ b/t/t5601-clone.sh
@@ -496,4 +496,11 @@ test_expect_success 'shallow clone locally' '
 	( cd ddsstt && git fsck )
 '
 
+test_expect_success 'GIT_TRACE_PACKFILE produces a usable pack' '
+	rm -rf dst.git &&
+	GIT_TRACE_PACKFILE=$PWD/tmp.pack git clone --no-local --bare src dst.git &&
+	git init --bare replay.git &&
+	git -C replay.git index-pack -v --stdin <tmp.pack
+'
+
 test_done
diff --git a/trace.c b/trace.c
index 3c3bd8f..7393926 100644
--- a/trace.c
+++ b/trace.c
@@ -120,6 +120,13 @@ static int prepare_trace_line(const char *file, int line,
 	return 1;
 }
 
+void trace_verbatim(struct trace_key *key, const void *buf, unsigned len)
+{
+	if (!trace_want(key))
+		return;
+	write_or_whine_pipe(get_trace_fd(key), buf, len, err_msg);
+}
+
 static void print_trace_line(struct trace_key *key, struct strbuf *buf)
 {
 	strbuf_complete_line(buf);
diff --git a/trace.h b/trace.h
index ae6a332..179b249 100644
--- a/trace.h
+++ b/trace.h
@@ -18,6 +18,7 @@ extern int trace_want(struct trace_key *key);
 extern void trace_disable(struct trace_key *key);
 extern uint64_t getnanotime(void);
 extern void trace_command_performance(const char **argv);
+extern void trace_verbatim(struct trace_key *key, const void *buf, unsigned len);
 
 #ifndef HAVE_VARIADIC_MACROS
 
-- 
2.4.3.699.g84b4da7

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] pkt-line: support tracing verbatim pack contents
  2015-06-16 17:18                   ` Jeff King
@ 2015-06-16 17:23                     ` Augie Fackler
  2015-06-16 19:31                       ` [PATCH/RFC 0/3] add GIT_TRACE_STDIN Jeff King
  0 siblings, 1 reply; 33+ messages in thread
From: Augie Fackler @ 2015-06-16 17:23 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 1:18 PM, Jeff King <peff@peff.net> wrote:
> On Tue, Jun 16, 2015 at 01:14:03PM -0400, Augie Fackler wrote:
>
>> Yeah, not having it for the push side is a slight bummer, but in
>> general I haven't had problems debugging git clients pushing bogus
>> data in the same way that I've had problems with weirdness in new
>> server features.
>
> Being in charge of a large git server farm, I think I have the opposite
> problem. :)

I've got plenty of servers too, but we haven't typically seen
push-time problems. Maybe we're lucky, and now that I've said
something and tempted Murphy I'll suffer torment. :)

>
>> > Here's a rough cut at the "trace stdin" idea I mentioned earlier (which
>> > is essentially an internal "tee"). You can collect the incoming pack
>> > like:
>>
>> Neat, but not sure I like the extra overhead of having to grab the
>> full trace and then reconstruct some arguments to be able to diagnose
>> the pack. Having the verbatim pack just land on disk is really handy,
>> because then any existing tools one has cooked up (my team has a few
>> weird one-off ones by now) just work without extra fussing or looking
>> up steps to reconstruct the whole file.
>
> I guess there is really room for both. Just because you _can_ accomplish
> the same thing with both does not mean we cannot have two ways to do it
> (an easy way, and a harder, more flexible way).

*nod* that might make the most sense - given that we both seem to have
use cases in mind for verbatim packs on pulls, that seems like a good
thing to have easy to deploy.

(It still seems to me more likely to write custom servers than
clients, but then again custom VCS servers has been my life for a
while, so I likely have a weird perspective on things.)

>
> -Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH/RFC 0/3] add GIT_TRACE_STDIN
  2015-06-16 17:23                     ` Augie Fackler
@ 2015-06-16 19:31                       ` Jeff King
  2015-06-16 19:35                         ` [PATCH 1/3] trace: implement %p placeholder for filenames Jeff King
                                           ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Jeff King @ 2015-06-16 19:31 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 01:23:31PM -0400, Augie Fackler wrote:

> > I guess there is really room for both. Just because you _can_ accomplish
> > the same thing with both does not mean we cannot have two ways to do it
> > (an easy way, and a harder, more flexible way).
> 
> *nod* that might make the most sense - given that we both seem to have
> use cases in mind for verbatim packs on pulls, that seems like a good
> thing to have easy to deploy.

My ulterior motive is that I actually already have a similar thing in
place _just_ for pack-objects, and I'd like to get rid of my custom
hack. :)

In that case it is not about saving the packfile, but rather saving the
parameters to create it (I was interested in finding out why git was
spending so much CPU to serve some particular requests, and being able
to run the same pack-generation repeatedly is helpful).

Here are the patches I came up with:

  [1/3]: trace: implement %p placeholder for filenames
  [2/3]: trace: add pid to each output line
  [3/3]: trace: add GIT_TRACE_STDIN

They apply on top of the TRACE_PACKFILE patches, only because they also
need the new trace_verbatim(). But I am not altogether happy with the
result; see the comments I'll add to 3/3.

-Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/3] trace: implement %p placeholder for filenames
  2015-06-16 19:31                       ` [PATCH/RFC 0/3] add GIT_TRACE_STDIN Jeff King
@ 2015-06-16 19:35                         ` Jeff King
  2015-06-16 19:36                         ` [PATCH 2/3] trace: add pid to each output line Jeff King
  2015-06-16 19:37                         ` [PATCH 3/3] trace: add GIT_TRACE_STDIN Jeff King
  2 siblings, 0 replies; 33+ messages in thread
From: Jeff King @ 2015-06-16 19:35 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

This lets you trace output on a per-process basis by naming
the trace file after the PID. This is probably not all that
useful for GIT_TRACE itself (which is about a global view of
the process hierarchy anyway), but will be useful for traces
which produce large amounts of data (e.g., whole packfiles).

Signed-off-by: Jeff King <peff@peff.net>
---
This is the bare minimum to let you distinguish between the stdin of
different processes. If we are going to use the trace subsystem to
collect more long-term debugging logs, it would probably make sense to
add placeholders for the program name and a timestamp. Then you could,
for example, leave more verbose debugging enabled on your servers all
the time.

 Documentation/git.txt |  3 ++-
 trace.c               | 19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/Documentation/git.txt b/Documentation/git.txt
index 3453669..80f6392 100644
--- a/Documentation/git.txt
+++ b/Documentation/git.txt
@@ -984,7 +984,8 @@ trace messages into this file descriptor.
 Alternatively, if the variable is set to an absolute path
 (starting with a '/' character), Git will interpret this
 as a file path and will try to write the trace messages
-into it.
+into it. If the filename contains the string `%p`, that string
+will be replaced with the PID of the traced process.
 +
 Unsetting the variable, or setting it to empty, "0" or
 "false" (case insensitive) disables trace messages.
diff --git a/trace.c b/trace.c
index 7393926..e1d1360 100644
--- a/trace.c
+++ b/trace.c
@@ -25,6 +25,16 @@
 #include "cache.h"
 #include "quote.h"
 
+static size_t expand_trace_name(struct strbuf *out, const char *fmt,
+				void *data)
+{
+	if (*fmt == 'p') {
+		strbuf_addf(out, "%lu", (unsigned long)getpid());
+		return 1;
+	}
+	return 0;
+}
+
 /* Get a trace file descriptor from "key" env variable. */
 static int get_trace_fd(struct trace_key *key)
 {
@@ -49,17 +59,22 @@ static int get_trace_fd(struct trace_key *key)
 	else if (strlen(trace) == 1 && isdigit(*trace))
 		key->fd = atoi(trace);
 	else if (is_absolute_path(trace)) {
-		int fd = open(trace, O_WRONLY | O_APPEND | O_CREAT, 0666);
+		struct strbuf name = STRBUF_INIT;
+		int fd;
+
+		strbuf_expand(&name, trace, expand_trace_name, NULL);
+		fd = open(name.buf, O_WRONLY | O_APPEND | O_CREAT, 0666);
 		if (fd == -1) {
 			fprintf(stderr,
 				"Could not open '%s' for tracing: %s\n"
 				"Defaulting to tracing on stderr...\n",
-				trace, strerror(errno));
+				name.buf, strerror(errno));
 			key->fd = STDERR_FILENO;
 		} else {
 			key->fd = fd;
 			key->need_close = 1;
 		}
+		strbuf_release(&name);
 	} else {
 		fprintf(stderr, "What does '%s' for %s mean?\n"
 			"If you want to trace into a file, then please set "
-- 
2.4.3.699.g84b4da7

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/3] trace: add pid to each output line
  2015-06-16 19:31                       ` [PATCH/RFC 0/3] add GIT_TRACE_STDIN Jeff King
  2015-06-16 19:35                         ` [PATCH 1/3] trace: implement %p placeholder for filenames Jeff King
@ 2015-06-16 19:36                         ` Jeff King
  2015-06-16 19:37                         ` [PATCH 3/3] trace: add GIT_TRACE_STDIN Jeff King
  2 siblings, 0 replies; 33+ messages in thread
From: Jeff King @ 2015-06-16 19:36 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

When there are many git commands running, it can be
confusing which process is producing which message. And if
one is trying to correlate a specific invocation with a
trace-file created using "%p", it's rather impossible.

Let's print the pid alongside each line. This does extend
the already-wide trace lines. We could make it optional, but
it's probably not worth the complexity of extra knobs.
Looking at trace output is already an exceptional thing to
be doing, and verbose output is not going to bother anyone.

Signed-off-by: Jeff King <peff@peff.net>
---
I think you can technically get by without this for most cases, as doing
GIT_TRACE=base.%p will get you the "built-in: ..." line for index-pack,
pack-objects, etc, which you can then correlate with a matching
GIT_TRACE_STDIN filename.

 trace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/trace.c b/trace.c
index e1d1360..a7ec484 100644
--- a/trace.c
+++ b/trace.c
@@ -123,12 +123,13 @@ static int prepare_trace_line(const char *file, int line,
 	localtime_r(&secs, &tm);
 	strbuf_addf(buf, "%02d:%02d:%02d.%06ld ", tm.tm_hour, tm.tm_min,
 		    tm.tm_sec, (long) tv.tv_usec);
+	strbuf_addf(buf, "[pid=%lu] ", (unsigned long)getpid());
 
 #ifdef HAVE_VARIADIC_MACROS
 	/* print file:line */
 	strbuf_addf(buf, "%s:%d ", file, line);
-	/* align trace output (column 40 catches most files names in git) */
-	while (buf->len < 40)
+	/* align trace output (column 50 catches most files names in git) */
+	while (buf->len < 50)
 		strbuf_addch(buf, ' ');
 #endif
 
-- 
2.4.3.699.g84b4da7

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-16 19:31                       ` [PATCH/RFC 0/3] add GIT_TRACE_STDIN Jeff King
  2015-06-16 19:35                         ` [PATCH 1/3] trace: implement %p placeholder for filenames Jeff King
  2015-06-16 19:36                         ` [PATCH 2/3] trace: add pid to each output line Jeff King
@ 2015-06-16 19:37                         ` Jeff King
  2015-06-16 19:49                           ` Jeff King
  2 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-16 19:37 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

Sometimes tracing the invocation of git programs with
GIT_TRACE is not quite enough to replay a situation; the
interesting input to the program often comes over its
standard input. For instance, if you want to replay a
particular fetch (e.g., for performance analysis or
debugging), you would want both the arguments and stdin sent
to pack-objects.

This patch lets you capture the stdin of any git process.
For instance:

  GIT_TRACE=/tmp/processes.out \
  GIT_TRACE_STDIN=/tmp/stdin.%p \
  git daemon ...

After a fetch, processes.out will contain a line like:

  15:19:08.275493 [pid=13196] git.c:348             trace:
  built-in: git 'pack-objects' '--revs' '--thin' '--stdout'
  '--progress' '--delta-base-offset'

And stdin.13196 (the pid picked from the above line) will
contain its stdin.

Signed-off-by: Jeff King <peff@peff.net>
---
 git.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/git.c b/git.c
index 44374b1..e7e58e3 100644
--- a/git.c
+++ b/git.c
@@ -616,6 +616,65 @@ static void restore_sigpipe_to_default(void)
 	signal(SIGPIPE, SIG_DFL);
 }
 
+static int copy_stdin(int in, int out, void *data)
+{
+	struct trace_key *key = data;
+	while (1) {
+		char buf[8192];
+		ssize_t len = xread(in, buf, sizeof(buf));
+		if (!len)
+			break;
+		if (len < 0) {
+			warning("error reading stdin trace: %s",
+				strerror(errno));
+			break;
+		}
+
+		trace_verbatim(key, buf, len);
+		if (write_in_full(out, buf, len) < 0) {
+			warning("error writing stdin trace: %s",
+				strerror(errno));
+			break;
+		}
+	}
+	close(in);
+	close(out);
+	return 0;
+}
+
+static void trace_stdin(void)
+{
+	static struct trace_key key = TRACE_KEY_INIT(STDIN);
+	static struct async async;
+
+	if (!trace_want(&key))
+		return;
+
+	memset(&async, 0, sizeof(async));
+	async.proc = copy_stdin;
+	async.data = &key;
+	async.in = dup(0);
+	async.out = -1;
+
+	if (async.in < 0 || start_async(&async) < 0) {
+		warning("unable to trace stdin: %s", strerror(errno));
+		return ;
+	}
+
+	/*
+	 * At this point we've handed stdin off to the async process,
+	 * so there we are past the point of no return.
+	 */
+	if (dup2(async.out, 0))
+		die_errno("unable to redirect stdin from async process");
+	close(async.out);
+
+	/*
+	 * leak async; we would know to finish_async() only when we are
+	 * exiting, and there is no point then
+	 */
+}
+
 int main(int argc, char **av)
 {
 	const char **argv = (const char **) av;
@@ -640,6 +699,7 @@ int main(int argc, char **av)
 	git_setup_gettext();
 
 	trace_command_performance(argv);
+	trace_stdin();
 
 	/*
 	 * "git-xxxx" is the same as "git xxxx", but we obviously:
-- 
2.4.3.699.g84b4da7

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-16 19:37                         ` [PATCH 3/3] trace: add GIT_TRACE_STDIN Jeff King
@ 2015-06-16 19:49                           ` Jeff King
  2015-06-16 21:20                             ` Jeff King
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-16 19:49 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 03:37:16PM -0400, Jeff King wrote:

> For instance:
> 
>   GIT_TRACE=/tmp/processes.out \
>   GIT_TRACE_STDIN=/tmp/stdin.%p \
>   git daemon ...
> 
> After a fetch, processes.out will contain a line like:
> 
>   15:19:08.275493 [pid=13196] git.c:348             trace:
>   built-in: git 'pack-objects' '--revs' '--thin' '--stdout'
>   '--progress' '--delta-base-offset'
> 
> And stdin.13196 (the pid picked from the above line) will
> contain its stdin.

So this is the part that I don't like. Collecting stdin is expensive,
and the commands above would be totally inappropriate for a production
server, for two reasons:

  1. It requires restarting git-daemon to turn on debugging.

  2. It logs for _everything_. Every repo, and every process.

I mentioned before that I have a similar patch for logging pack-objects.
That is triggered via config, which solves both of these problems. What
I'd really like is to be able to do:

  git config trace.pack-objects.stdin fetches/stdin.%p

in a particular repository, collect data for a few minutes, and then
turn it back off.

The patch below implements that, but it doesn't quite work as you might
hope. The problem is that we cannot read the repository config so early
in the git.c startup; we do not even know if we have a repository yet.
Pushing the config-reading later is too late; we may already have looked
at GIT_TRACE_* and decided whether to trace (and possibly even written
trace records!).

It's possible we could hack around it by rearranging the startup
sequence, but that sequence is notoriously fragile.

Another option would be to stop trying to intercept stdin in git.c, and
instead make this a feature of run-command.c. That is, right before we
exec a process, tee its stdin there. That means that you cannot do:

  GIT_TRACE_STDIN=/tmp/foo.out git foo

to collect the stdin of foo. But that is not really an interesting case
anyway. You can run "tee" yourself if you want. The interesting cases
are the ones where git is spawning a sub-process, and you want to
intercept the data moving between the git processes.

-Peff

---
diff --git a/git.c b/git.c
index e7e58e3..cbb7e9b 100644
--- a/git.c
+++ b/git.c
@@ -675,6 +675,38 @@ static void trace_stdin(void)
 	 */
 }
 
+static int trace_config_cb(const char *var, const char *value, void *data)
+{
+	const char *want_cmd = data;
+	const char *have_cmd, *key;
+	int have_len;
+
+	if (!parse_config_key(var, "trace", &have_cmd, &have_len, &key) &&
+	    have_cmd &&
+	    !strncmp(want_cmd, have_cmd, have_len) &&
+	    want_cmd[have_len] == '\0') {
+		struct strbuf buf = STRBUF_INIT;
+		strbuf_addstr(&buf, "GIT_TRACE_");
+		while (*key)
+			strbuf_addch(&buf, toupper(*key++));
+		setenv(buf.buf, value, 0);
+		strbuf_release(&buf);
+	}
+	return 0;
+}
+
+static void load_trace_config(const char *cmd, const char **argv)
+{
+	/* XXX we don't know the cmd for sure until we parse the options */
+	if (!strcmp(cmd, "git"))
+		cmd = argv[1];
+	else
+		skip_prefix(cmd, "git-", &cmd);
+
+	if (cmd)
+		git_config(trace_config_cb, (void *)cmd);
+}
+
 int main(int argc, char **av)
 {
 	const char **argv = (const char **) av;
@@ -698,6 +730,7 @@ int main(int argc, char **av)
 
 	git_setup_gettext();
 
+	load_trace_config(cmd, argv);
 	trace_command_performance(argv);
 	trace_stdin();
 

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-16 19:49                           ` Jeff King
@ 2015-06-16 21:20                             ` Jeff King
  2015-06-17 10:04                               ` Duy Nguyen
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff King @ 2015-06-16 21:20 UTC (permalink / raw)
  To: Augie Fackler; +Cc: Junio C Hamano, Johannes Sixt, git, Stefan Beller

On Tue, Jun 16, 2015 at 03:49:07PM -0400, Jeff King wrote:

> Another option would be to stop trying to intercept stdin in git.c, and
> instead make this a feature of run-command.c. That is, right before we
> exec a process, tee its stdin there. That means that you cannot do:
> 
>   GIT_TRACE_STDIN=/tmp/foo.out git foo
> 
> to collect the stdin of foo. But that is not really an interesting case
> anyway. You can run "tee" yourself if you want. The interesting cases
> are the ones where git is spawning a sub-process, and you want to
> intercept the data moving between the git processes.

Hmm. I guess we do not actually have to move the stdin interception
there. We can just move the config-checking there, like the patch below.

It basically just converts trace.foo.bar into GIT_TRACE_BAR when we are
running "foo" as a git command.  This does work, but is perhaps
potentially confusing to the user, because it only kicks in when _git_
runs "foo". IOW, this works:

  git config trace.upload-pack.foo /path/to/foo.out
  git daemon

and will trace as you expect. But then running:

  git upload-pack

yourself will do nothing.

I dunno.

---
 run-command.c | 17 +++++++++++++++++
 trace.c       | 44 ++++++++++++++++++++++++++++++++++++++++++++
 trace.h       |  9 +++++++++
 3 files changed, 70 insertions(+)

diff --git a/run-command.c b/run-command.c
index 4d73e90..2febbb5 100644
--- a/run-command.c
+++ b/run-command.c
@@ -284,6 +284,23 @@ int start_command(struct child_process *cmd)
 
 	if (!cmd->argv)
 		cmd->argv = cmd->args.argv;
+	if (cmd->git_cmd) {
+		/*
+		 * Load any extra variables into env_array. But
+		 * if we weren't going to use it (in favor of "env"),
+		 * then consolidate the two. Make sure the original "env"
+		 * goes after what we add, so that it can override.
+		 *
+		 * We cannot just keep two lists, because we may hand off the
+		 * single list to a spawn() implementation.
+		 */
+		trace_config_for(cmd->argv[0], &cmd->env_array);
+		if (cmd->env_array.argc && cmd->env) {
+			for (; *cmd->env; cmd->env++)
+				argv_array_push(&cmd->env_array, *cmd->env);
+			cmd->env = NULL;
+		}
+	}
 	if (!cmd->env)
 		cmd->env = cmd->env_array.argv;
 
diff --git a/trace.c b/trace.c
index a7ec484..86c988e 100644
--- a/trace.c
+++ b/trace.c
@@ -24,6 +24,7 @@
 
 #include "cache.h"
 #include "quote.h"
+#include "argv-array.h"
 
 static size_t expand_trace_name(struct strbuf *out, const char *fmt,
 				void *data)
@@ -449,3 +450,46 @@ void trace_command_performance(const char **argv)
 	sq_quote_argv(&command_line, argv, 0);
 	command_start_time = getnanotime();
 }
+
+struct trace_config_data {
+	const char *want_cmd;
+	struct argv_array *out;
+};
+
+static int trace_config_cb(const char *var, const char *value, void *vdata)
+{
+	struct trace_config_data *data = vdata;
+	const char *have_cmd, *key;
+	int have_len;
+
+	if (!parse_config_key(var, "trace", &have_cmd, &have_len, &key) &&
+	    have_cmd &&
+	    !strncmp(data->want_cmd, have_cmd, have_len) &&
+	    data->want_cmd[have_len] == '\0') {
+		struct strbuf buf = STRBUF_INIT;
+
+		strbuf_addstr(&buf, "GIT_TRACE_");
+		while (*key)
+			strbuf_addch(&buf, toupper(*key++));
+
+		/*
+		 * Environment always takes precedence over config, so do not
+		 * override existing variables. We cannot rely on setenv()'s
+		 * overwrite flag here, because we may pass the list off to
+		 * a spawn() implementation, which always overwrites.
+		 */
+		if (!getenv(buf.buf))
+			argv_array_pushf(data->out, "%s=%s", buf.buf, value);
+
+		strbuf_release(&buf);
+	}
+	return 0;
+}
+
+void trace_config_for(const char *cmd, struct argv_array *out)
+{
+	struct trace_config_data data;
+	data.want_cmd = cmd;
+	data.out = out;
+	git_config(trace_config_cb, &data);
+}
diff --git a/trace.h b/trace.h
index 179b249..83618e9 100644
--- a/trace.h
+++ b/trace.h
@@ -4,6 +4,8 @@
 #include "git-compat-util.h"
 #include "strbuf.h"
 
+struct argv_array;
+
 struct trace_key {
 	const char * const key;
 	int fd;
@@ -20,6 +22,13 @@ extern uint64_t getnanotime(void);
 extern void trace_command_performance(const char **argv);
 extern void trace_verbatim(struct trace_key *key, const void *buf, unsigned len);
 
+/**
+ * Load any trace-related config for git command "cmd", and insert the matching
+ * environment variables into "out", which is suitable for use by run-command
+ * and friends.
+ */
+void trace_config_for(const char *cmd, struct argv_array *out);
+
 #ifndef HAVE_VARIADIC_MACROS
 
 __attribute__((format (printf, 1, 2)))
-- 
2.4.3.699.g84b4da7

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-16 21:20                             ` Jeff King
@ 2015-06-17 10:04                               ` Duy Nguyen
  2015-06-17 19:10                                 ` Jeff King
  0 siblings, 1 reply; 33+ messages in thread
From: Duy Nguyen @ 2015-06-17 10:04 UTC (permalink / raw)
  To: Jeff King
  Cc: Augie Fackler, Junio C Hamano, Johannes Sixt, Git Mailing List,
	Stefan Beller

On Wed, Jun 17, 2015 at 4:20 AM, Jeff King <peff@peff.net> wrote:
> On Tue, Jun 16, 2015 at 03:49:07PM -0400, Jeff King wrote:
>
>> Another option would be to stop trying to intercept stdin in git.c, and
>> instead make this a feature of run-command.c. That is, right before we
>> exec a process, tee its stdin there. That means that you cannot do:
>>
>>   GIT_TRACE_STDIN=/tmp/foo.out git foo
>>
>> to collect the stdin of foo. But that is not really an interesting case
>> anyway. You can run "tee" yourself if you want. The interesting cases
>> are the ones where git is spawning a sub-process, and you want to
>> intercept the data moving between the git processes.
>
> Hmm. I guess we do not actually have to move the stdin interception
> there. We can just move the config-checking there, like the patch below.
>
> It basically just converts trace.foo.bar into GIT_TRACE_BAR when we are
> running "foo" as a git command.  This does work, but is perhaps
> potentially confusing to the user, because it only kicks in when _git_
> runs "foo". IOW, this works:
>
>   git config trace.upload-pack.foo /path/to/foo.out
>   git daemon

I wonder if we could do it a bit differently. Instead of
GIT_TRACE_STDIN, I would add GIT_TRACE_HOOK that points to a script.
Whenever a command is run via run-command interface, the actual
command line to be executed would be "<hook script> <command>
<arguments>".

Because this script is given full command line, it can decide to trace
something if the command name is matched (or arguments are matched) or
just execute the original command. It's more flexible that trace.*
config keys. We also have an opportunity to replace builtin commands,
like pack-objects, in command pipeline in fetch or push with something
else, to inject errors or whatever. It can be done manually, but it's
not easy or convenient.
-- 
Duy

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-17 10:04                               ` Duy Nguyen
@ 2015-06-17 19:10                                 ` Jeff King
  2015-06-18 10:20                                   ` Duy Nguyen
  2015-06-26 18:47                                   ` Junio C Hamano
  0 siblings, 2 replies; 33+ messages in thread
From: Jeff King @ 2015-06-17 19:10 UTC (permalink / raw)
  To: Duy Nguyen
  Cc: Augie Fackler, Junio C Hamano, Johannes Sixt, Git Mailing List,
	Stefan Beller

On Wed, Jun 17, 2015 at 05:04:04PM +0700, Duy Nguyen wrote:

> I wonder if we could do it a bit differently. Instead of
> GIT_TRACE_STDIN, I would add GIT_TRACE_HOOK that points to a script.
> Whenever a command is run via run-command interface, the actual
> command line to be executed would be "<hook script> <command>
> <arguments>".

Hmm, yeah, I like that. It's even more flexible, and it is much more
obvious why it works only for run-command. If we feed the resulting
"hooked" command to the shell, I think you could do:

  GIT_TRACE_HOOK='
    f() {
      case "$1 $2" in
      git pack-objects)
        tee /tmp/foo.out | "$@"
	;;
      esac
    }; f
  '

That is not 100% correct (you would miss "git --some-arg pack-objects"),
but it is probably fine in practice for debugging sessions. It is a bit
more complicated to use, but I really like the flexibility (I can
imagine that "GIT_TRACE_HOOK=gdbserver localhost:1234" would come in
handy).

> Because this script is given full command line, it can decide to trace
> something if the command name is matched (or arguments are matched) or
> just execute the original command. It's more flexible that trace.*
> config keys. We also have an opportunity to replace builtin commands,
> like pack-objects, in command pipeline in fetch or push with something
> else, to inject errors or whatever. It can be done manually, but it's
> not easy or convenient.

My other motive for trace.* was that we could have something like
"trace.prune", and have git-prune provide verbose debugging information.
We have custom patches like that on GitHub servers, which we've used to
debug occasional weirdness (e.g., you find that an object is missing
from a repo, but you have no clue why it went away; was it never there,
did somebody prune it, did it get dropped from a pack?).

I can send those upstream, but it would be nice not to introduce a
totally separate tracing facility when trace_* is so close. But it
needs:

  1. To be enabled by config, not environment.

  2. To support some basic output filename flexibility so the output can
     be organized (we write the equivalent of GIT_TRACE_FOO to
     $GIT_DIR/ghlog_foo/YYYY-MM-DD/YYYY-MM-DDTHH:MM:SS.PID).

For (1), we could just load trace.* in git_default_config; you couldn't
use it with any "early" tracing that happens before then, but I think in
practice it would be fine for most traces.

For (2), I think we could accomplish that with %-placeholders (like my
earlier patch), and the ability to write relative paths into $GIT_DIR
(again, you couldn't do this for "early" traces, but you could for other
stuff).

Or we could just do nothing. I'm not sure if anybody else is actually
interested in verbose-logging patches like these.

-Peff

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-17 19:10                                 ` Jeff King
@ 2015-06-18 10:20                                   ` Duy Nguyen
  2015-06-26 18:47                                   ` Junio C Hamano
  1 sibling, 0 replies; 33+ messages in thread
From: Duy Nguyen @ 2015-06-18 10:20 UTC (permalink / raw)
  To: Jeff King
  Cc: Augie Fackler, Junio C Hamano, Johannes Sixt, Git Mailing List,
	Stefan Beller

On Thu, Jun 18, 2015 at 2:10 AM, Jeff King <peff@peff.net> wrote:
> On Wed, Jun 17, 2015 at 05:04:04PM +0700, Duy Nguyen wrote:
>
>> I wonder if we could do it a bit differently. Instead of
>> GIT_TRACE_STDIN, I would add GIT_TRACE_HOOK that points to a script.
>> Whenever a command is run via run-command interface, the actual
>> command line to be executed would be "<hook script> <command>
>> <arguments>".
>
> Hmm, yeah, I like that. It's even more flexible, and it is much more
> obvious why it works only for run-command. If we feed the resulting
> "hooked" command to the shell, I think you could do:
>
>   GIT_TRACE_HOOK='
>     f() {
>       case "$1 $2" in
>       git pack-objects)
>         tee /tmp/foo.out | "$@"
>         ;;
>       esac
>     }; f
>   '
>
> That is not 100% correct (you would miss "git --some-arg pack-objects"),

Yeah, flexibility always comes with traps and pitfalls.

> but it is probably fine in practice for debugging sessions. It is a bit
> more complicated to use, but I really like the flexibility (I can
> imagine that "GIT_TRACE_HOOK=gdbserver localhost:1234" would come in
> handy).
>
>> Because this script is given full command line, it can decide to trace
>> something if the command name is matched (or arguments are matched) or
>> just execute the original command. It's more flexible that trace.*
>> config keys. We also have an opportunity to replace builtin commands,
>> like pack-objects, in command pipeline in fetch or push with something
>> else, to inject errors or whatever. It can be done manually, but it's
>> not easy or convenient.
>
> My other motive for trace.* was that we could have something like
> "trace.prune", and have git-prune provide verbose debugging information.
> We have custom patches like that on GitHub servers, which we've used to
> debug occasional weirdness (e.g., you find that an object is missing
> from a repo, but you have no clue why it went away; was it never there,
> did somebody prune it, did it get dropped from a pack?).
>
> I can send those upstream, but it would be nice not to introduce a
> totally separate tracing facility when trace_* is so close. But it
> needs:
>
>   1. To be enabled by config, not environment.
>
>   2. To support some basic output filename flexibility so the output can
>      be organized (we write the equivalent of GIT_TRACE_FOO to
>      $GIT_DIR/ghlog_foo/YYYY-MM-DD/YYYY-MM-DDTHH:MM:SS.PID).
>
> For (1), we could just load trace.* in git_default_config; you couldn't
> use it with any "early" tracing that happens before then, but I think in
> practice it would be fine for most traces.
>
> For (2), I think we could accomplish that with %-placeholders (like my
> earlier patch), and the ability to write relative paths into $GIT_DIR
> (again, you couldn't do this for "early" traces, but you could for other
> stuff).
>
> Or we could just do nothing. I'm not sure if anybody else is actually
> interested in verbose-logging patches like these.

I'm not stopping you from doing this, just to be clear. I was just
trying to convince you to do something extra that I wanted to use ;)
-- 
Duy

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/3] trace: add GIT_TRACE_STDIN
  2015-06-17 19:10                                 ` Jeff King
  2015-06-18 10:20                                   ` Duy Nguyen
@ 2015-06-26 18:47                                   ` Junio C Hamano
  1 sibling, 0 replies; 33+ messages in thread
From: Junio C Hamano @ 2015-06-26 18:47 UTC (permalink / raw)
  To: Jeff King
  Cc: Duy Nguyen, Augie Fackler, Johannes Sixt, Git Mailing List,
	Stefan Beller

Jeff King <peff@peff.net> writes:

> My other motive for trace.* was that we could have something like
> "trace.prune", and have git-prune provide verbose debugging information.
> We have custom patches like that on GitHub servers, which we've used to
> debug occasional weirdness (e.g., you find that an object is missing
> from a repo, but you have no clue why it went away; was it never there,
> did somebody prune it, did it get dropped from a pack?).
>
> I can send those upstream, but it would be nice not to introduce a
> totally separate tracing facility when trace_* is so close. But it
> needs:
>
>   1. To be enabled by config, not environment.
>
>   2. To support some basic output filename flexibility so the output can
>      be organized (we write the equivalent of GIT_TRACE_FOO to
>      $GIT_DIR/ghlog_foo/YYYY-MM-DD/YYYY-MM-DDTHH:MM:SS.PID).
>
> For (1), we could just load trace.* in git_default_config; you couldn't
> use it with any "early" tracing that happens before then, but I think in
> practice it would be fine for most traces.
>
> For (2), I think we could accomplish that with %-placeholders (like my
> earlier patch), and the ability to write relative paths into $GIT_DIR
> (again, you couldn't do this for "early" traces, but you could for other
> stuff).

Both sounds like a sensible compromise to me.

Thanks, all, for an interesting discussion.

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2015-06-26 18:47 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CAGZ79kaS4utvDbXOo7emmSUH6M-8LY-oA65Ss3PLDkFModkbSg@mail.gmail.com>
2015-06-11 18:59 ` [PATCH v2] fetch-pack: optionally save packs to disk Augie Fackler
2015-06-12  6:22   ` Johannes Sixt
2015-06-12 15:07     ` Junio C Hamano
2015-06-12 17:02       ` Augie Fackler
2015-06-12 18:00       ` Jeff King
2015-06-12 21:25         ` Jeff King
2015-06-12 21:28           ` [PATCH 1/3] pkt-line: simplify starts_with checks in packet tracing Jeff King
2015-06-12 21:35             ` Stefan Beller
2015-06-12 21:28           ` [PATCH 2/3] pkt-line: tighten sideband PACK check when tracing Jeff King
2015-06-12 21:39             ` Stefan Beller
2015-06-12 21:41               ` Jeff King
2015-06-12 21:43                 ` Stefan Beller
2015-06-12 21:28           ` [PATCH 3/3] pkt-line: support tracing verbatim pack contents Jeff King
2015-06-16 15:38             ` Augie Fackler
2015-06-16 16:39               ` Junio C Hamano
2015-06-16 16:43                 ` Jeff King
2015-06-16 16:52                   ` Augie Fackler
2015-06-16 17:23                     ` Jeff King
2015-06-16 17:10               ` Jeff King
2015-06-16 17:14                 ` Augie Fackler
2015-06-16 17:18                   ` Jeff King
2015-06-16 17:23                     ` Augie Fackler
2015-06-16 19:31                       ` [PATCH/RFC 0/3] add GIT_TRACE_STDIN Jeff King
2015-06-16 19:35                         ` [PATCH 1/3] trace: implement %p placeholder for filenames Jeff King
2015-06-16 19:36                         ` [PATCH 2/3] trace: add pid to each output line Jeff King
2015-06-16 19:37                         ` [PATCH 3/3] trace: add GIT_TRACE_STDIN Jeff King
2015-06-16 19:49                           ` Jeff King
2015-06-16 21:20                             ` Jeff King
2015-06-17 10:04                               ` Duy Nguyen
2015-06-17 19:10                                 ` Jeff King
2015-06-18 10:20                                   ` Duy Nguyen
2015-06-26 18:47                                   ` Junio C Hamano
2015-06-12 16:54     ` [PATCH v2] fetch-pack: optionally save packs to disk Augie Fackler

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.