git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 1/3] patch-id: make it stable against hunk reordering
@ 2014-03-30 18:09 Michael S. Tsirkin
  2014-03-30 18:09 ` [PATCH v3 2/3] patch-id: document new behaviour Michael S. Tsirkin
                   ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-03-30 18:09 UTC (permalink / raw)
  To: git; +Cc: sunshine, jrnieder, peff, gitster

Patch id changes if users
1. reorder file diffs that make up a patch
or
2. split a patch up to multiple diffs that touch the same path
(keeping hunks within a single diff ordered to make patch valid).

As the result is functionally equivalent, a different patch id is
surprising to many users.
In particular, reordering files using diff -O is helpful to make patches
more readable (e.g. API header diff before implementation diff).

Change patch-id behaviour making it stable against these two kinds
of patch change:
1. calculate SHA1 hash for each hunk separately and sum all hashes
(using a symmetrical sum) to get patch id
2. hash the file-level headers together with each hunk (not just the
first hunk)

We use a 20byte sum and not xor - since xor would give 0 output
for patches that have two identical diffs, which isn't all that
unlikely (e.g. append the same line in two places).

Add a new flag --unstable to get the historical behaviour.

Add --stable which is a nop, for symmetry.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

changes from v2:
	several bugfixes
changes from v1:
	hanges from v1: documented motivation for supporting
	diff splitting (and not just file reordering).
	No code changes.

 builtin/patch-id.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 16 deletions(-)

diff --git a/builtin/patch-id.c b/builtin/patch-id.c
index 3cfe02d..7fd7007 100644
--- a/builtin/patch-id.c
+++ b/builtin/patch-id.c
@@ -1,17 +1,14 @@
 #include "builtin.h"
 
-static void flush_current_id(int patchlen, unsigned char *id, git_SHA_CTX *c)
+static void flush_current_id(int patchlen, unsigned char *id, unsigned char *result)
 {
-	unsigned char result[20];
 	char name[50];
 
 	if (!patchlen)
 		return;
 
-	git_SHA1_Final(result, c);
 	memcpy(name, sha1_to_hex(id), 41);
 	printf("%s %s\n", sha1_to_hex(result), name);
-	git_SHA1_Init(c);
 }
 
 static int remove_space(char *line)
@@ -56,10 +53,31 @@ static int scan_hunk_header(const char *p, int *p_before, int *p_after)
 	return 1;
 }
 
-static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct strbuf *line_buf)
+static void flush_one_hunk(unsigned char *result, git_SHA_CTX *ctx)
 {
-	int patchlen = 0, found_next = 0;
+	unsigned char hash[20];
+	unsigned short carry = 0;
+	int i;
+
+	git_SHA1_Final(hash, ctx);
+	git_SHA1_Init(ctx);
+	/* 20-byte sum, with carry */
+	for (i = 0; i < 20; ++i) {
+		carry += result[i] + hash[i];
+		result[i] = carry;
+		carry >>= 8;
+	}
+}
+
+static int get_one_patchid(unsigned char *next_sha1, unsigned char *result,
+			   struct strbuf *line_buf, int stable)
+{
+	int patchlen = 0, found_next = 0, hunks = 0;
 	int before = -1, after = -1;
+	git_SHA_CTX ctx, header_ctx;
+
+	git_SHA1_Init(&ctx);
+	hashclr(result);
 
 	while (strbuf_getwholeline(line_buf, stdin, '\n') != EOF) {
 		char *line = line_buf->buf;
@@ -98,7 +116,19 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
 		if (before == 0 && after == 0) {
 			if (!memcmp(line, "@@ -", 4)) {
 				/* Parse next hunk, but ignore line numbers.  */
+				if (stable) {
+					/* Hash the file-level headers together with each hunk. */
+					if (hunks) {
+						flush_one_hunk(result, &ctx);
+						/* Prepend saved header ctx for next hunk.  */
+						memcpy(&ctx, &header_ctx, sizeof ctx);
+					} else {
+						/* Save header ctx for next hunk.  */
+						memcpy(&header_ctx, &ctx, sizeof ctx);
+					}
+				}
 				scan_hunk_header(line, &before, &after);
+				hunks++;
 				continue;
 			}
 
@@ -107,7 +137,10 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
 				break;
 
 			/* Else we're parsing another header.  */
+			if (stable && hunks)
+				flush_one_hunk(result, &ctx);
 			before = after = -1;
+			hunks = 0;
 		}
 
 		/* If we get here, we're inside a hunk.  */
@@ -119,39 +152,46 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
 		/* Compute the sha without whitespace */
 		len = remove_space(line);
 		patchlen += len;
-		git_SHA1_Update(ctx, line, len);
+		git_SHA1_Update(&ctx, line, len);
 	}
 
 	if (!found_next)
 		hashclr(next_sha1);
 
+	flush_one_hunk(result, &ctx);
+
 	return patchlen;
 }
 
-static void generate_id_list(void)
+static void generate_id_list(int stable)
 {
-	unsigned char sha1[20], n[20];
-	git_SHA_CTX ctx;
+	unsigned char sha1[20], n[20], result[20];
 	int patchlen;
 	struct strbuf line_buf = STRBUF_INIT;
 
-	git_SHA1_Init(&ctx);
 	hashclr(sha1);
 	while (!feof(stdin)) {
-		patchlen = get_one_patchid(n, &ctx, &line_buf);
-		flush_current_id(patchlen, sha1, &ctx);
+		patchlen = get_one_patchid(n, result, &line_buf, stable);
+		flush_current_id(patchlen, sha1, result);
 		hashcpy(sha1, n);
 	}
 	strbuf_release(&line_buf);
 }
 
-static const char patch_id_usage[] = "git patch-id < patch";
+static const char patch_id_usage[] = "git patch-id [--stable | --unstable] < patch";
 
 int cmd_patch_id(int argc, const char **argv, const char *prefix)
 {
-	if (argc != 1)
+	int stable;
+	if (argc == 2 && !strcmp(argv[1], "--stable"))
+		stable = 1;
+	else if (argc == 2 && !strcmp(argv[1], "--unstable"))
+		stable = 0;
+	else if (argc == 1)
+		stable = 1;
+	else
 		usage(patch_id_usage);
 
-	generate_id_list();
+	generate_id_list(stable);
 	return 0;
 }
-- 
MST

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v3 2/3] patch-id: document new behaviour
  2014-03-30 18:09 [PATCH v3 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
@ 2014-03-30 18:09 ` Michael S. Tsirkin
  2014-03-31 19:08   ` Junio C Hamano
  2014-03-30 18:09 ` [PATCH v3 3/3] patch-id-test: test --stable and --unstable flags Michael S. Tsirkin
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-03-30 18:09 UTC (permalink / raw)
  To: git; +Cc: sunshine, jrnieder, peff, gitster

Clarify that patch ID is now a sum of hashes, not a hash.
Document --stable and --unstable flags.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

changes from v2:
	explicitly list the kinds of changes against which patch ID is stable

 Documentation/git-patch-id.txt | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/Documentation/git-patch-id.txt b/Documentation/git-patch-id.txt
index 312c3b1..30923e0 100644
--- a/Documentation/git-patch-id.txt
+++ b/Documentation/git-patch-id.txt
@@ -8,14 +8,14 @@ git-patch-id - Compute unique ID for a patch
 SYNOPSIS
 --------
 [verse]
-'git patch-id' < <patch>
+'git patch-id' [--stable | --unstable] < <patch>
 
 DESCRIPTION
 -----------
-A "patch ID" is nothing but a SHA-1 of the diff associated with a patch, with
-whitespace and line numbers ignored.  As such, it's "reasonably stable", but at
-the same time also reasonably unique, i.e., two patches that have the same "patch
-ID" are almost guaranteed to be the same thing.
+A "patch ID" is nothing but a sum of SHA-1 of the diff hunks associated with a
+patch, with whitespace and line numbers ignored.  As such, it's "reasonably
+stable", but at the same time also reasonably unique, i.e., two patches that
+have the same "patch ID" are almost guaranteed to be the same thing.
 
 IOW, you can use this thing to look for likely duplicate commits.
 
@@ -27,6 +27,19 @@ This can be used to make a mapping from patch ID to commit ID.
 
 OPTIONS
 -------
+
+--stable::
+	Use a symmetrical sum of hashes as the patch ID.
+	With this option, reordering file diffs that make up a patch or
+	splitting a diff up to multiple diffs that touch the same path
+	does not affect the ID.
+	This is the default.
+
+--unstable::
+	Use a non-symmetrical sum of hashes, such that reordering
+	or splitting the patch does affect the ID.
+	This was the default value for git 1.9 and older.
+
 <patch>::
 	The diff to create the ID of.
 
-- 
MST

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v3 3/3] patch-id-test: test --stable and --unstable flags
  2014-03-30 18:09 [PATCH v3 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
  2014-03-30 18:09 ` [PATCH v3 2/3] patch-id: document new behaviour Michael S. Tsirkin
@ 2014-03-30 18:09 ` Michael S. Tsirkin
  2014-03-31 19:29   ` Junio C Hamano
  2014-03-31 17:59 ` [PATCH v3 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
  2014-03-31 22:05 ` Junio C Hamano
  3 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-03-30 18:09 UTC (permalink / raw)
  To: git; +Cc: sunshine, jrnieder, peff, gitster

Verify that patch ID is now stable against diff split and reordering.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v2:
	added test to verify patch ID is stable against diff splitting

 t/t4204-patch-id.sh | 117 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 109 insertions(+), 8 deletions(-)

diff --git a/t/t4204-patch-id.sh b/t/t4204-patch-id.sh
index d2c930d..1679714 100755
--- a/t/t4204-patch-id.sh
+++ b/t/t4204-patch-id.sh
@@ -5,12 +5,46 @@ test_description='git patch-id'
 . ./test-lib.sh
 
 test_expect_success 'setup' '
-	test_commit initial foo a &&
-	test_commit first foo b &&
-	git checkout -b same HEAD^ &&
-	test_commit same-msg foo b &&
-	git checkout -b notsame HEAD^ &&
-	test_commit notsame-msg foo c
+	cat > a <<-\EOF &&
+		a
+		a
+		a
+		a
+		a
+		a
+		a
+		a
+		EOF
+	(cat a; echo b) > ab &&
+	(echo d; cat a; echo b) > dab &&
+	cp a foo &&
+	cp a bar &&
+	git add foo bar &&
+	git commit -a -m initial &&
+	cp ab foo &&
+	cp ab bar &&
+	git commit -a -m first &&
+	git checkout -b same master &&
+	git commit --amend -m same-msg &&
+	git checkout -b notsame master &&
+	echo c > foo &&
+	echo c > bar &&
+	git commit --amend -a -m notsame-msg &&
+	git checkout -b split master &&
+	cp dab foo &&
+	cp dab bar &&
+	git commit -a -m split &&
+	git checkout -b merged master &&
+	git checkout split -- foo bar &&
+	git commit --amend -a -m merged &&
+	cat > bar-then-foo <<-\EOF &&
+		bar
+		foo
+		EOF
+	cat > foo-then-bar <<-\EOF
+		foo
+		bar
+		EOF
 '
 
 test_expect_success 'patch-id output is well-formed' '
@@ -23,11 +57,33 @@ calc_patch_id () {
 		sed "s# .*##" > patch-id_"$1"
 }
 
+calc_patch_id_unstable () {
+	git patch-id --unstable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+calc_patch_id_stable () {
+	git patch-id --stable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+
 get_patch_id () {
-	git log -p -1 "$1" | git patch-id |
+	git log -p -1 "$1" -O bar-then-foo -- | git patch-id |
 		sed "s# .*##" > patch-id_"$1"
 }
 
+get_patch_id_stable () {
+	git log -p -1 "$1" -O bar-then-foo | git patch-id --stable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+get_patch_id_unstable () {
+	git log -p -1 "$1" -O bar-then-foo | git patch-id --unstable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+
 test_expect_success 'patch-id detects equality' '
 	get_patch_id master &&
 	get_patch_id same &&
@@ -52,10 +108,55 @@ test_expect_success 'patch-id supports git-format-patch output' '
 test_expect_success 'whitespace is irrelevant in footer' '
 	get_patch_id master &&
 	git checkout same &&
-	git format-patch -1 --stdout | sed "s/ \$//" | calc_patch_id same &&
+	git format-patch -1 --stdout | sed "s/ \$//" |
+		calc_patch_id same &&
 	test_cmp patch-id_master patch-id_same
 '
 
+test_expect_success 'file order is irrelevant by default' '
+	get_patch_id master &&
+	git checkout same &&
+	git format-patch -1 --stdout -O foo-then-bar |
+		calc_patch_id same &&
+	test_cmp patch-id_master patch-id_same
+'
+
+test_expect_success 'file order is irrelevant with --stable' '
+	get_patch_id_stable master &&
+	git checkout same &&
+	git format-patch -1 --stdout -O foo-then-bar |
+		calc_patch_id_stable same &&
+	test_cmp patch-id_master patch-id_same
+'
+
+test_expect_success 'file order is relevant with --unstable' '
+	get_patch_id_unstable master &&
+	git checkout same &&
+	git format-patch -1 --stdout -O foo-then-bar | calc_patch_id_unstable notsame &&
+	! test_cmp patch-id_master patch-id_notsame
+'
+
+test_expect_success 'splitting patch does not affect id by default' '
+	get_patch_id merged &&
+	(git log -p -1 -O foo-then-bar split~1; git diff split~1..split) |
+		calc_patch_id split &&
+	test_cmp patch-id_merged patch-id_split
+'
+
+test_expect_success 'splitting patch does not affect id with --stable' '
+	get_patch_id_stable merged &&
+	(git log -p -1 -O foo-then-bar split~1; git diff split~1..split) |
+		calc_patch_id_stable split &&
+	test_cmp patch-id_merged patch-id_split
+'
+
+test_expect_success 'splitting patch affects id with --unstable' '
+	get_patch_id_unstable merged &&
+	(git log -p -1 -O foo-then-bar split~1; git diff split~1..split) |
+		calc_patch_id_unstable split &&
+	! test_cmp patch-id_merged patch-id_split
+'
+
 test_expect_success 'patch-id supports git-format-patch MIME output' '
 	get_patch_id master &&
 	git checkout same &&
-- 
MST

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 1/3] patch-id: make it stable against hunk reordering
  2014-03-30 18:09 [PATCH v3 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
  2014-03-30 18:09 ` [PATCH v3 2/3] patch-id: document new behaviour Michael S. Tsirkin
  2014-03-30 18:09 ` [PATCH v3 3/3] patch-id-test: test --stable and --unstable flags Michael S. Tsirkin
@ 2014-03-31 17:59 ` Junio C Hamano
  2014-03-31 19:04   ` Michael S. Tsirkin
  2014-03-31 22:05 ` Junio C Hamano
  3 siblings, 1 reply; 15+ messages in thread
From: Junio C Hamano @ 2014-03-31 17:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> Patch id changes if users
> 1. reorder file diffs that make up a patch
> or
> 2. split a patch up to multiple diffs that touch the same path
> (keeping hunks within a single diff ordered to make patch valid).
>
> As the result is functionally equivalent, a different patch id is
> surprising to many users.
> In particular, reordering files using diff -O is helpful to make patches
> more readable (e.g. API header diff before implementation diff).
>
> Change patch-id behaviour making it stable against these two kinds
> of patch change:
> 1. calculate SHA1 hash for each hunk separately and sum all hashes
> (using a symmetrical sum) to get patch id
> 2. hash the file-level headers together with each hunk (not just the
> first hunk)
>
> We use a 20byte sum and not xor - since xor would give 0 output
> for patches that have two identical diffs, which isn't all that
> unlikely (e.g. append the same line in two places).
>
> Add a new flag --unstable to get the historical behaviour.
>
> Add --stable which is a nop, for symmetry.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> changes from v2:
> 	several bugfixes
> changes from v1:
> 	hanges from v1: documented motivation for supporting
> 	diff splitting (and not just file reordering).
> 	No code changes.
>
>  builtin/patch-id.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 56 insertions(+), 16 deletions(-)

Does this have to interact or be consistent with patch-ids.c which
is the real patch-id machinery used to filter like-changes out by
"cherry-pick" and "log --cherry-pick"?

This series opens a very interesting opportunity by making it
possible to introduce the equivalence between two patches that touch
the same file and a single patch that concatenates hunks from these
two patches.

One example I am wondering about is perhaps this could be used to
detect two branches, both sides with many patches cherry-picked from
the same source, but some patches squashed together on one branch
but not on the other.  It would be very nice if you can detect that
two sets of patches are equivalent taken as a whole in such a
situation while rebasing one on top of the other.

Another example is that another mode that gives a set of broken-up
patch-ids for each hunk contained in the input.  Suppose there is a
patch that is only meant to be used on the proprietary fork of an
open source project, and the project releases the open source
portion by cherry-picking topics from the development tree used for
the proprietary "trunk".  The integration service of such a project
used to prepare the open source branch may want to have a
pre-receive hook that says "do not merge any commit to cause this
this hunk appear in the result, no matter what other changes the
patches in the commit may bring in", and broken-down patch-ids
(e.g. "diff HEAD...$commit | patch-id --individual") may be an
ingredient to implement such a hook.  There may be interesting
applications other than such a "broken-down patch-ids" that can be
based on the enhancement you are presenting here.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 1/3] patch-id: make it stable against hunk reordering
  2014-03-31 17:59 ` [PATCH v3 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
@ 2014-03-31 19:04   ` Michael S. Tsirkin
  2014-03-31 19:35     ` Junio C Hamano
  0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-03-31 19:04 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, sunshine, jrnieder, peff

On Mon, Mar 31, 2014 at 10:59:50AM -0700, Junio C Hamano wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > Patch id changes if users
> > 1. reorder file diffs that make up a patch
> > or
> > 2. split a patch up to multiple diffs that touch the same path
> > (keeping hunks within a single diff ordered to make patch valid).
> >
> > As the result is functionally equivalent, a different patch id is
> > surprising to many users.
> > In particular, reordering files using diff -O is helpful to make patches
> > more readable (e.g. API header diff before implementation diff).
> >
> > Change patch-id behaviour making it stable against these two kinds
> > of patch change:
> > 1. calculate SHA1 hash for each hunk separately and sum all hashes
> > (using a symmetrical sum) to get patch id
> > 2. hash the file-level headers together with each hunk (not just the
> > first hunk)
> >
> > We use a 20byte sum and not xor - since xor would give 0 output
> > for patches that have two identical diffs, which isn't all that
> > unlikely (e.g. append the same line in two places).
> >
> > Add a new flag --unstable to get the historical behaviour.
> >
> > Add --stable which is a nop, for symmetry.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >
> > changes from v2:
> > 	several bugfixes
> > changes from v1:
> > 	hanges from v1: documented motivation for supporting
> > 	diff splitting (and not just file reordering).
> > 	No code changes.
> >
> >  builtin/patch-id.c | 72 ++++++++++++++++++++++++++++++++++++++++++------------
> >  1 file changed, 56 insertions(+), 16 deletions(-)
> 
> Does this have to interact or be consistent with patch-ids.c which
> is the real patch-id machinery used to filter like-changes out by
> "cherry-pick" and "log --cherry-pick"?

I don't know off-hand.

Specifically, this is diff_flush_patch_id and in diff.c, isn't it?

> This series opens a very interesting opportunity by making it
> possible to introduce the equivalence between two patches that touch
> the same file and a single patch that concatenates hunks from these
> two patches.
> 
> One example I am wondering about is perhaps this could be used to
> detect two branches, both sides with many patches cherry-picked from
> the same source, but some patches squashed together on one branch
> but not on the other.  It would be very nice if you can detect that
> two sets of patches are equivalent taken as a whole in such a
> situation while rebasing one on top of the other.
> 
> Another example is that another mode that gives a set of broken-up
> patch-ids for each hunk contained in the input.  Suppose there is a
> patch that is only meant to be used on the proprietary fork of an
> open source project, and the project releases the open source
> portion by cherry-picking topics from the development tree used for
> the proprietary "trunk".  The integration service of such a project
> used to prepare the open source branch may want to have a
> pre-receive hook that says "do not merge any commit to cause this
> this hunk appear in the result, no matter what other changes the
> patches in the commit may bring in", and broken-down patch-ids
> (e.g. "diff HEAD...$commit | patch-id --individual") may be an
> ingredient to implement such a hook.  There may be interesting
> applications other than such a "broken-down patch-ids" that can be
> based on the enhancement you are presenting here.

OK sure, I can tweak that to use the same algorithm if desired,
though it does look like an unrelated enhancement to me.
Agree?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-03-30 18:09 ` [PATCH v3 2/3] patch-id: document new behaviour Michael S. Tsirkin
@ 2014-03-31 19:08   ` Junio C Hamano
  2014-03-31 19:26     ` Michael S. Tsirkin
  0 siblings, 1 reply; 15+ messages in thread
From: Junio C Hamano @ 2014-03-31 19:08 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> Clarify that patch ID is now a sum of hashes, not a hash.
> Document --stable and --unstable flags.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> changes from v2:
> 	explicitly list the kinds of changes against which patch ID is stable
>
>  Documentation/git-patch-id.txt | 23 ++++++++++++++++++-----
>  1 file changed, 18 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/git-patch-id.txt b/Documentation/git-patch-id.txt
> index 312c3b1..30923e0 100644
> --- a/Documentation/git-patch-id.txt
> +++ b/Documentation/git-patch-id.txt
> @@ -8,14 +8,14 @@ git-patch-id - Compute unique ID for a patch
>  SYNOPSIS
>  --------
>  [verse]
> -'git patch-id' < <patch>
> +'git patch-id' [--stable | --unstable] < <patch>

Thanks.  It seems taht we are fairly inconsistent when writing
alternatives on the SYNOPSIS line.  A small minority seems to spell
the above as "[--stable|--unstable]", which may want to be fixed
(outside the context of this series, of course).

>  
>  DESCRIPTION
>  -----------
> -A "patch ID" is nothing but a SHA-1 of the diff associated with a patch, with
> -whitespace and line numbers ignored.  As such, it's "reasonably stable", but at
> -the same time also reasonably unique, i.e., two patches that have the same "patch
> -ID" are almost guaranteed to be the same thing.
> +A "patch ID" is nothing but a sum of SHA-1 of the diff hunks associated with a
> +patch, with whitespace and line numbers ignored.  As such, it's "reasonably
> +stable", but at the same time also reasonably unique, i.e., two patches that
> +have the same "patch ID" are almost guaranteed to be the same thing.

Perhaps "nothing but" can go by now?

>  
>  IOW, you can use this thing to look for likely duplicate commits.
>  
> @@ -27,6 +27,19 @@ This can be used to make a mapping from patch ID to commit ID.
>  
>  OPTIONS
>  -------
> +
> +--stable::
> +	Use a symmetrical sum of hashes as the patch ID.
> +	With this option, reordering file diffs that make up a patch or
> +	splitting a diff up to multiple diffs that touch the same path
> +	does not affect the ID.
> +	This is the default.
> +
> +--unstable::
> +	Use a non-symmetrical sum of hashes, such that reordering
> +	or splitting the patch does affect the ID.
> +	This was the default value for git 1.9 and older.

I am not sure if swapping the default in this series is a wise
decision.  We typically introduce a new shiny toy to play with in a
release and then later when the shiny toy proves to be useful, start
to think about changing the default, but not before.

>  <patch>::
>  	The diff to create the ID of.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-03-31 19:08   ` Junio C Hamano
@ 2014-03-31 19:26     ` Michael S. Tsirkin
  2014-03-31 19:54       ` Junio C Hamano
  0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-03-31 19:26 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, sunshine, jrnieder, peff

On Mon, Mar 31, 2014 at 12:08:36PM -0700, Junio C Hamano wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > Clarify that patch ID is now a sum of hashes, not a hash.
> > Document --stable and --unstable flags.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >
> > changes from v2:
> > 	explicitly list the kinds of changes against which patch ID is stable
> >
> >  Documentation/git-patch-id.txt | 23 ++++++++++++++++++-----
> >  1 file changed, 18 insertions(+), 5 deletions(-)
> >
> > diff --git a/Documentation/git-patch-id.txt b/Documentation/git-patch-id.txt
> > index 312c3b1..30923e0 100644
> > --- a/Documentation/git-patch-id.txt
> > +++ b/Documentation/git-patch-id.txt
> > @@ -8,14 +8,14 @@ git-patch-id - Compute unique ID for a patch
> >  SYNOPSIS
> >  --------
> >  [verse]
> > -'git patch-id' < <patch>
> > +'git patch-id' [--stable | --unstable] < <patch>
> 
> Thanks.  It seems taht we are fairly inconsistent when writing
> alternatives on the SYNOPSIS line.  A small minority seems to spell
> the above as "[--stable|--unstable]", which may want to be fixed
> (outside the context of this series, of course).
> 
> >  
> >  DESCRIPTION
> >  -----------
> > -A "patch ID" is nothing but a SHA-1 of the diff associated with a patch, with
> > -whitespace and line numbers ignored.  As such, it's "reasonably stable", but at
> > -the same time also reasonably unique, i.e., two patches that have the same "patch
> > -ID" are almost guaranteed to be the same thing.
> > +A "patch ID" is nothing but a sum of SHA-1 of the diff hunks associated with a
> > +patch, with whitespace and line numbers ignored.  As such, it's "reasonably
> > +stable", but at the same time also reasonably unique, i.e., two patches that
> > +have the same "patch ID" are almost guaranteed to be the same thing.
> 
> Perhaps "nothing but" can go by now?

Sure. I was also wondering whether we should document the fact that we
are using SHA-1 internally, or just say "hashes".
In the end people can always read the source to find out so ...

> >  
> >  IOW, you can use this thing to look for likely duplicate commits.
> >  
> > @@ -27,6 +27,19 @@ This can be used to make a mapping from patch ID to commit ID.
> >  
> >  OPTIONS
> >  -------
> > +
> > +--stable::
> > +	Use a symmetrical sum of hashes as the patch ID.
> > +	With this option, reordering file diffs that make up a patch or
> > +	splitting a diff up to multiple diffs that touch the same path
> > +	does not affect the ID.
> > +	This is the default.
> > +
> > +--unstable::
> > +	Use a non-symmetrical sum of hashes, such that reordering
> > +	or splitting the patch does affect the ID.
> > +	This was the default value for git 1.9 and older.
> 
> I am not sure if swapping the default in this series is a wise
> decision.  We typically introduce a new shiny toy to play with in a
> release and then later when the shiny toy proves to be useful, start
> to think about changing the default, but not before.

Well I would claim that this is really a bugfix: --unstable
is here really just in case someone relies on the broken
behaviour.

The hash used is mostly an internal implementation detail, isn't it?
One of my motivators is to upstream
	[PATCH] diff: add a config option to control orderfile
so that I can use ordered diffs more easily, sending them to people and
not worrying about broken patch IDs.
If people have to remember to use --stable, that's of course harder.

If we keep+--unstable as default, I'd say we'll need a configuration
option to enable --stable: I can at least tell people to enable that.
We'll also need some way for people to discover what the default was.

As it is - it's simple: if --stable is there, it's the default.

> >  <patch>::
> >  	The diff to create the ID of.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 3/3] patch-id-test: test --stable and --unstable flags
  2014-03-30 18:09 ` [PATCH v3 3/3] patch-id-test: test --stable and --unstable flags Michael S. Tsirkin
@ 2014-03-31 19:29   ` Junio C Hamano
  0 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2014-03-31 19:29 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> Verify that patch ID is now stable against diff split and reordering.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> Changes from v2:
> 	added test to verify patch ID is stable against diff splitting
>
>  t/t4204-patch-id.sh | 117 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 109 insertions(+), 8 deletions(-)
>
> diff --git a/t/t4204-patch-id.sh b/t/t4204-patch-id.sh
> index d2c930d..1679714 100755
> --- a/t/t4204-patch-id.sh
> +++ b/t/t4204-patch-id.sh
> @@ -5,12 +5,46 @@ test_description='git patch-id'
>  . ./test-lib.sh
>  
>  test_expect_success 'setup' '
> -	test_commit initial foo a &&
> -	test_commit first foo b &&
> -	git checkout -b same HEAD^ &&
> -	test_commit same-msg foo b &&
> -	git checkout -b notsame HEAD^ &&
> -	test_commit notsame-msg foo c
> +	cat > a <<-\EOF &&

Please do not add an extra space between the redirection operator
(e.g. ">", "2>", etc.) and its operand.  The same style violations
everywhere in this patch.

> +		a
> +		a
> +		a
> +		a
> +		a
> +		a
> +		a
> +		a
> +		EOF

Please align EOF with the cat that receives it, and pretend that the
column the head of the cat is at is the leftmost column for the
payload, when writing <<- here-document, e.g.

        cat >a <<-\EOF &&
        a
        EOF

> +	(cat a; echo b) > ab &&
> +	(echo d; cat a; echo b) > dab &&
> +	cp a foo &&
> +	cp a bar &&

Probably a helper function would make it more apparent what is going
on?

	as="a a a a a a a a" && # eight a
	test_write_lines $as b >ab &&
	test_write_lines d $as b >dab &&
        test_write_lines $as >foo &&
        test_write_lines $as >bar &&

Or even better, use test_write_lines in places you do use the result
to overwrite foo/bar and get rid of ab and dab?

That helper can also be used to prepare bar-then-foo.

>  test_expect_success 'patch-id output is well-formed' '
> @@ -23,11 +57,33 @@ calc_patch_id () {
>  		sed "s# .*##" > patch-id_"$1"
>  }
>  
> +calc_patch_id_unstable () {
> +	git patch-id --unstable |
> +		sed "s# .*##" > patch-id_"$1"

Not a new problem, but can you align "git patch-id" and "sed" to the
same column?  Also, not using "/" when there is no slash involved in
the pattern makes readers waste their time wondering why the script
avoids it.

> +}
> +
> +calc_patch_id_stable () {
> +	git patch-id --stable |
> +		sed "s# .*##" > patch-id_"$1"
> +}
> +
> +

Extra blank line.

>  get_patch_id () {
> -	git log -p -1 "$1" | git patch-id |
> +	git log -p -1 "$1" -O bar-then-foo -- | git patch-id |
>  		sed "s# .*##" > patch-id_"$1"
>  }
>  
> +get_patch_id_stable () {
> +	git log -p -1 "$1" -O bar-then-foo | git patch-id --stable |
> +		sed "s# .*##" > patch-id_"$1"

Why doesn't it use calc_patch_id_stable?

> +}
> +
> +get_patch_id_unstable () {
> +	git log -p -1 "$1" -O bar-then-foo | git patch-id --unstable |
> +		sed "s# .*##" > patch-id_"$1"

Ditto.

> +}
> +
> +

Extra blank line.

>  test_expect_success 'patch-id detects equality' '
>  	get_patch_id master &&
>  	get_patch_id same &&
> @@ -52,10 +108,55 @@ test_expect_success 'patch-id supports git-format-patch output' '
>  test_expect_success 'whitespace is irrelevant in footer' '
>  	get_patch_id master &&
>  	git checkout same &&
> -	git format-patch -1 --stdout | sed "s/ \$//" | calc_patch_id same &&
> +	git format-patch -1 --stdout | sed "s/ \$//" |
> +		calc_patch_id same &&

What is this change about?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 1/3] patch-id: make it stable against hunk reordering
  2014-03-31 19:04   ` Michael S. Tsirkin
@ 2014-03-31 19:35     ` Junio C Hamano
  0 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2014-03-31 19:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> though it does look like an unrelated enhancement to me.
> Agree?

Yes, that is exactly why I said "opens interesting opportunity" and
"making it possible" ;-) They are all very related, but they do not
have to graduate as parts of the same series.

The important thing is that the basic infrastructure of the new
codepath is designed in such a way that it later allows us to reuse
it in these changes.

I'm queuing these three patches almost as-is (I think I tweaked the
"sizeof" thing) on 'pu', without fixing the test styles and other
issues I may have pointed out in my reviews myself, expecting
further clean-ups.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-03-31 19:26     ` Michael S. Tsirkin
@ 2014-03-31 19:54       ` Junio C Hamano
  2014-03-31 20:42         ` Michael S. Tsirkin
  0 siblings, 1 reply; 15+ messages in thread
From: Junio C Hamano @ 2014-03-31 19:54 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> The hash used is mostly an internal implementation detail, isn't it?

Yes, but that does not mean we can break people who keep an external
database indexed with the patch-id by changing the default under
them, and "they can give --unstable option to work it around" is a
workaround, not a fix.  Without this change, they did not have to do
anything.

I would imagine that most of these people will be using the plain
vanilla "git show" output without any ordering or hunk splitting
when coming up with such a key.  A possible way forward to allow the
configuration that corresponds to "-O<orderfile>" while not breaking
the existing users could be to make the "patch-id --stable" kick in
automatically (of course, do this only when the user did not give
the "--unstable" command line option to override) when we see the
orderfile configuration in the repository, or when we see that the
incoming patch looks like reordered (e.g. has multiple "diff --git"
header lines that refer to the same path, or the set of files
mentioned by the "diff --git" lines are not in ascending order),
perhaps?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-03-31 19:54       ` Junio C Hamano
@ 2014-03-31 20:42         ` Michael S. Tsirkin
  2014-04-02 18:18           ` Junio C Hamano
  0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-03-31 20:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, sunshine, jrnieder, peff

On Mon, Mar 31, 2014 at 12:54:46PM -0700, Junio C Hamano wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > The hash used is mostly an internal implementation detail, isn't it?
> 
> Yes, but that does not mean we can break people who keep an external
> database indexed with the patch-id by changing the default under
> them, and "they can give --unstable option to work it around" is a
> workaround, not a fix.  Without this change, they did not have to do
> anything.
> 
> I would imagine that most of these people will be using the plain
> vanilla "git show" output without any ordering or hunk splitting
> when coming up with such a key.  A possible way forward to allow the
> configuration that corresponds to "-O<orderfile>" while not breaking
> the existing users could be to make the "patch-id --stable" kick in
> automatically (of course, do this only when the user did not give
> the "--unstable" command line option to override) when we see the
> orderfile configuration in the repository, or when we see that the
> incoming patch looks like reordered (e.g. has multiple "diff --git"
> header lines that refer to the same path,

This would require us to track affected files in memory.
Issue?

> or the set of files
> mentioned by the "diff --git" lines are not in ascending order),
> perhaps?

I hope a patch-id configuration flag plus maybe checking the orderfile if not
specified together should be good enough.

-- 
MST

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 1/3] patch-id: make it stable against hunk reordering
  2014-03-30 18:09 [PATCH v3 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
                   ` (2 preceding siblings ...)
  2014-03-31 17:59 ` [PATCH v3 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
@ 2014-03-31 22:05 ` Junio C Hamano
  3 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2014-03-31 22:05 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> +						memcpy(&ctx, &header_ctx, sizeof ctx);
> +					} else {
> +						/* Save header ctx for next hunk.  */
> +						memcpy(&header_ctx, &ctx, sizeof ctx);

I'll fix these to sizeof(ctx) when queuing.

Thanks.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-03-31 20:42         ` Michael S. Tsirkin
@ 2014-04-02 18:18           ` Junio C Hamano
  2014-04-02 19:02             ` Michael S. Tsirkin
  0 siblings, 1 reply; 15+ messages in thread
From: Junio C Hamano @ 2014-04-02 18:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Mon, Mar 31, 2014 at 12:54:46PM -0700, Junio C Hamano wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > The hash used is mostly an internal implementation detail, isn't it?
>> 
>> Yes, but that does not mean we can break people who keep an external
>> database indexed with the patch-id by changing the default under
>> them, and "they can give --unstable option to work it around" is a
>> workaround, not a fix.  Without this change, they did not have to do
>> anything.
>> 
>> I would imagine that most of these people will be using the plain
>> vanilla "git show" output without any ordering or hunk splitting
>> when coming up with such a key.  A possible way forward to allow the
>> configuration that corresponds to "-O<orderfile>" while not breaking
>> the existing users could be to make the "patch-id --stable" kick in
>> automatically (of course, do this only when the user did not give
>> the "--unstable" command line option to override) when we see the
>> orderfile configuration in the repository, or when we see that the
>> incoming patch looks like reordered (e.g. has multiple "diff --git"
>> header lines that refer to the same path,
>
> This would require us to track affected files in memory.
> Issue?

Don't we already do that in order to handle a patch that touches the
same path more than once anyway?  I think a possibly larger issue
might be that you would still want to do the hashing in a single
pass so you may need to always keep two accumulated hashes, before
you can decide if the patch is or is not a straight-forward one and
use one of the two, but that hopefully should not require a rocket
scientist.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-04-02 18:18           ` Junio C Hamano
@ 2014-04-02 19:02             ` Michael S. Tsirkin
  2014-04-03 15:42               ` Junio C Hamano
  0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2014-04-02 19:02 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, sunshine, jrnieder, peff

On Wed, Apr 02, 2014 at 11:18:26AM -0700, Junio C Hamano wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Mon, Mar 31, 2014 at 12:54:46PM -0700, Junio C Hamano wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > The hash used is mostly an internal implementation detail, isn't it?
> >> 
> >> Yes, but that does not mean we can break people who keep an external
> >> database indexed with the patch-id by changing the default under
> >> them, and "they can give --unstable option to work it around" is a
> >> workaround, not a fix.  Without this change, they did not have to do
> >> anything.
> >> 
> >> I would imagine that most of these people will be using the plain
> >> vanilla "git show" output without any ordering or hunk splitting
> >> when coming up with such a key.  A possible way forward to allow the
> >> configuration that corresponds to "-O<orderfile>" while not breaking
> >> the existing users could be to make the "patch-id --stable" kick in
> >> automatically (of course, do this only when the user did not give
> >> the "--unstable" command line option to override) when we see the
> >> orderfile configuration in the repository, or when we see that the
> >> incoming patch looks like reordered (e.g. has multiple "diff --git"
> >> header lines that refer to the same path,
> >
> > This would require us to track affected files in memory.
> > Issue?
> 
> Don't we already do that in order to handle a patch that touches the
> same path more than once anyway?

At least I don't see it in builtin/patch-id.c

> I think a possibly larger issue
> might be that you would still want to do the hashing in a single
> pass so you may need to always keep two accumulated hashes, before
> you can decide if the patch is or is not a straight-forward one and
> use one of the two, but that hopefully should not require a rocket
> scientist.

But the issue is that equivalent patches would get a different hash.
This is what I tried to fix, after all.

So I think I prefer using an option and not a heuristic if you
are fine with that.
At some point in the future we might flip the default.

-- 
MST

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3 2/3] patch-id: document new behaviour
  2014-04-02 19:02             ` Michael S. Tsirkin
@ 2014-04-03 15:42               ` Junio C Hamano
  0 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2014-04-03 15:42 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> So I think I prefer using an option and not a heuristic if you
> are fine with that.

Sure. Changing behaviour only by explicit user request (command line
or configuration) is much safer than heuristics that does not work
reliably.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2014-04-03 15:42 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-03-30 18:09 [PATCH v3 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
2014-03-30 18:09 ` [PATCH v3 2/3] patch-id: document new behaviour Michael S. Tsirkin
2014-03-31 19:08   ` Junio C Hamano
2014-03-31 19:26     ` Michael S. Tsirkin
2014-03-31 19:54       ` Junio C Hamano
2014-03-31 20:42         ` Michael S. Tsirkin
2014-04-02 18:18           ` Junio C Hamano
2014-04-02 19:02             ` Michael S. Tsirkin
2014-04-03 15:42               ` Junio C Hamano
2014-03-30 18:09 ` [PATCH v3 3/3] patch-id-test: test --stable and --unstable flags Michael S. Tsirkin
2014-03-31 19:29   ` Junio C Hamano
2014-03-31 17:59 ` [PATCH v3 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
2014-03-31 19:04   ` Michael S. Tsirkin
2014-03-31 19:35     ` Junio C Hamano
2014-03-31 22:05 ` Junio C Hamano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).