All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC/PATCH] patch-ids: check modified paths before calculating diff
@ 2013-05-19 13:17 John Keeping
  2013-05-19 19:36 ` Jonathan Nieder
  2013-05-29  6:20 ` Jeff King
  0 siblings, 2 replies; 13+ messages in thread
From: John Keeping @ 2013-05-19 13:17 UTC (permalink / raw)
  To: git; +Cc: Kevin Bracey, Junio C Hamano, John Keeping

When using "git cherry" or "git log --cherry-pick" we often have a small
number of commits on one side and a large number on the other.  In
revision.c::cherry_pick_list we store the patch IDs for the small side
before comparing the large side to this.

In this case, it is likely that only a small number of paths are touched
by the commits on the smaller side of the range and by storing these we
can ignore many commits on the other side before unpacking blobs and
diffing them.

This improves performance in every example I have tried (times are best
of three, using git.git):

Before:
$ time git log --cherry master...jk/submodule-subdirectory-ok >/dev/null

real    0m0.373s
user    0m0.341s
sys     0m0.031s

After:
$ time git log --cherry master...jk/submodule-subdirectory-ok >/dev/null

real    0m0.060s
user    0m0.055s
sys     0m0.005s

Before:
$ time git log --cherry next...pu >/dev/null

real    0m0.661s
user    0m0.617s
sys     0m0.044s

After:
$ time git log --cherry next...pu >/dev/null

real    0m0.509s
user    0m0.478s
sys     0m0.030s

Signed-off-by: John Keeping <john@keeping.me.uk>
---
 patch-ids.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 patch-ids.h |   3 ++
 2 files changed, 145 insertions(+)

diff --git a/patch-ids.c b/patch-ids.c
index bc8a28f..912f40c 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -1,5 +1,6 @@
 #include "cache.h"
 #include "diff.h"
+#include "diffcore.h"
 #include "commit.h"
 #include "sha1-lookup.h"
 #include "patch-ids.h"
@@ -16,6 +17,137 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 	return diff_flush_patch_id(options, sha1);
 }
 
+struct collect_paths_info {
+	struct string_list *paths;
+	int searching;
+};
+
+static int collect_paths_recursive(int n, struct tree_desc *t,
+	const char *base, struct pathspec *pathspec,
+	struct collect_paths_info *data);
+
+static int same_entry(struct name_entry *a, struct name_entry *b)
+{
+	if (!a->sha1 || !b->sha1)
+		return a->sha1 == b->sha1;
+	return	!hashcmp(a->sha1, b->sha1) &&
+		a->mode == b->mode;
+}
+
+static char *traverse_path(const struct traverse_info *info,
+		const struct name_entry *n)
+{
+	char *path = xmalloc(traverse_path_len(info, n) + 1);
+	return make_traverse_path(path, info, n);
+}
+
+static int add_touched_path(struct collect_paths_info *info, const char *path)
+{
+	if (info->searching) {
+		if (!string_list_has_string(info->paths, path))
+			return -1;
+	}
+	string_list_insert(info->paths, path);
+	return 0;
+}
+
+static inline const unsigned char *dir_sha1(struct name_entry *e)
+{
+	if (S_ISDIR(e->mode))
+		return e->sha1;
+	return NULL;
+}
+
+static int collect_touched_paths_cb(int n, unsigned long mask,
+		unsigned long dirmask, struct name_entry *entry,
+		struct traverse_info *info)
+{
+	struct collect_paths_info *collect_info = info->data;
+	if (n == 1) {
+		/* We're handling a root commit - add all the paths. */
+		if (entry[0].sha1 && !S_ISDIR(entry[0].mode)) {
+			if (add_touched_path(collect_info, entry[0].path))
+				return -1;
+		} else if (S_ISDIR(entry[0].mode)) {
+			char *newbase = traverse_path(info, entry);
+			struct tree_desc t[1];
+			void *buf0 = fill_tree_descriptor(t, entry[0].sha1);
+			int error = collect_paths_recursive(1, t, newbase,
+					info->pathspec, collect_info);
+			free(buf0);
+			free(newbase);
+			if (error < 0)
+				return error;
+		}
+		return mask;
+	}
+
+	if (same_entry(entry+0, entry+1))
+		return mask;
+
+	if (entry[0].mode && !S_ISDIR(entry[0].mode))
+		if (add_touched_path(collect_info, entry[0].path))
+			return -1;
+	if (entry[1].mode && !S_ISDIR(entry[1].mode))
+		if (add_touched_path(collect_info, entry[1].path))
+			return -1;
+
+	if ((entry[0].sha1 && S_ISDIR(entry[0].mode)) ||
+	    (entry[1].sha1 && S_ISDIR(entry[1].mode))) {
+		char *newbase = traverse_path(info,
+				S_ISDIR(entry[0].mode) ? entry+0 : entry+1);
+		struct tree_desc t[2];
+		void *buf0 = fill_tree_descriptor(t+0, dir_sha1(entry+0));
+		void *buf1 = fill_tree_descriptor(t+1, dir_sha1(entry+1));
+		int error = collect_paths_recursive(2, t, newbase,
+				info->pathspec, collect_info);
+		free(buf0);
+		free(buf1);
+		free(newbase);
+		if (error < 0)
+			return error;
+	}
+
+	return mask;
+}
+
+static int collect_paths_recursive(int n, struct tree_desc *t,
+		const char *base, struct pathspec *pathspec,
+		struct collect_paths_info *data)
+{
+	struct traverse_info info;
+
+	setup_traverse_info(&info, base);
+	info.data = data;
+	info.fn = collect_touched_paths_cb;
+	info.pathspec = pathspec;
+
+	return traverse_trees(n, t, &info);
+}
+
+static int collect_touched_paths(struct commit *commit, struct patch_ids *ids,
+		int searching)
+{
+	struct tree_desc trees[2];
+	struct collect_paths_info info = { &ids->touched_paths, searching };
+	void *commitbuf;
+	int result;
+
+	commitbuf = fill_tree_descriptor(trees + 1, commit->object.sha1);
+	if (commit->parents) {
+		void *parentbuf = fill_tree_descriptor(trees + 0,
+					commit->parents->item->object.sha1);
+		result = collect_paths_recursive(2, trees, "",
+				&ids->diffopts.pathspec, &info);
+		free(parentbuf);
+	} else {
+		result = collect_paths_recursive(1, trees + 1, "",
+				&ids->diffopts.pathspec, &info);
+	}
+	free(commitbuf);
+	return result;
+}
+
 static const unsigned char *patch_id_access(size_t index, void *table)
 {
 	struct patch_id **id_table = table;
@@ -40,6 +172,7 @@ int init_patch_ids(struct patch_ids *ids)
 	diff_setup(&ids->diffopts);
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	diff_setup_done(&ids->diffopts);
+	ids->touched_paths.strdup_strings = 1;
 	return 0;
 }
 
@@ -52,6 +185,8 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	string_list_clear(&ids->touched_paths, 0);
 	return 0;
 }
 
@@ -64,6 +199,13 @@ static struct patch_id *add_commit(struct commit *commit,
 	unsigned char sha1[20];
 	int pos;
 
+	if (no_add) {
+		if (collect_touched_paths(commit, ids, 1) < 0)
+			return NULL;
+	} else {
+		collect_touched_paths(commit, ids, 0);
+	}
+
 	if (commit_patch_id(commit, &ids->diffopts, sha1))
 		return NULL;
 	pos = patch_pos(ids->table, ids->nr, sha1);
diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c90bec5 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -1,6 +1,8 @@
 #ifndef PATCH_IDS_H
 #define PATCH_IDS_H
 
+#include "string-list.h"
+
 struct patch_id {
 	unsigned char patch_id[20];
 	char seen;
@@ -8,6 +10,7 @@ struct patch_id {
 
 struct patch_ids {
 	struct diff_options diffopts;
+	struct string_list touched_paths;
 	int nr, alloc;
 	struct patch_id **table;
 	struct patch_id_bucket *patches;
-- 
1.8.3.rc3.372.g721bad8

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-19 13:17 [RFC/PATCH] patch-ids: check modified paths before calculating diff John Keeping
@ 2013-05-19 19:36 ` Jonathan Nieder
  2013-05-19 22:02   ` John Keeping
  2013-05-20  4:46   ` Junio C Hamano
  2013-05-29  6:20 ` Jeff King
  1 sibling, 2 replies; 13+ messages in thread
From: Jonathan Nieder @ 2013-05-19 19:36 UTC (permalink / raw)
  To: John Keeping; +Cc: git, Kevin Bracey, Junio C Hamano

John Keeping wrote:

> In this case, it is likely that only a small number of paths are touched
> by the commits on the smaller side of the range and by storing these we
> can ignore many commits on the other side before unpacking blobs and
> diffing them.

I like this idea a lot.

> --- a/patch-ids.c
> +++ b/patch-ids.c
> @@ -1,5 +1,6 @@
>  #include "cache.h"
>  #include "diff.h"
> +#include "diffcore.h"
>  #include "commit.h"
>  #include "sha1-lookup.h"
>  #include "patch-ids.h"
> @@ -16,6 +17,137 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
>  	return diff_flush_patch_id(options, sha1);
>  }
>  
> +struct collect_paths_info {
> +	struct string_list *paths;
> +	int searching;
> +};
> +
> +static int collect_paths_recursive(int n, struct tree_desc *t,
> +	const char *base, struct pathspec *pathspec,
> +	struct collect_paths_info *data);

Can you say a little about how this function works (e.g., in a
comment)?  What are the preconditions and postconditions?  How does
the state evolve (e.g, when is "searching" set)?

> +
> +static int same_entry(struct name_entry *a, struct name_entry *b)
> +{
> +	if (!a->sha1 || !b->sha1)
> +		return a->sha1 == b->sha1;
> +	return	!hashcmp(a->sha1, b->sha1) &&
> +		a->mode == b->mode;

Style: unusual whitespace (the tab after "return").  I'd just do

	if (!a->sha1 || ...)
		return ...
	return !hashcmp(a->sha1, b->sha1) && a->mode == b->mode;

since it is not too long.

[...]
> +static char *traverse_path(const struct traverse_info *info,
> +		const struct name_entry *n)
> +{
> +	char *path = xmalloc(traverse_path_len(info, n) + 1);
> +	return make_traverse_path(path, info, n);
> +}

This function seems to be the same as one of the same name in
builtin/merge-tree.c.  Should it be put somewhere more public, for
example as part of the tree-walk API?  Who is responsible for freeing
"path"?  Would it make sense to use a strbuf here?

	strbuf_setlen(&buf, traverse_path_len(info, n));
	make_traverse_path(&buf.buf, info, n);

> +
> +static int add_touched_path(struct collect_paths_info *info, const char *path)
> +{
> +	if (info->searching) {
> +		if (!string_list_has_string(info->paths, path))
> +			return -1;
> +	}
> +	string_list_insert(info->paths, path);
> +	return 0;
> +}

Same question about internal API: when I see

	add_touched_path(info, path)

what should I expect it to do?

In the info->searching case, "string_list_insert(info->paths, path)"
will always be a no-op, right?  What does it mean that this is adding
a touched path in that case?

[...]
> +static int collect_touched_paths_cb(int n, unsigned long mask,
> +		unsigned long dirmask, struct name_entry *entry,
> +		struct traverse_info *info)
> +{

Same question about contracts.  Ideally a reader in a rush should be
able to read an opening comment about what the function does and
then return to the caller without delving into the details of how
it does its work.

> +	struct collect_paths_info *collect_info = info->data;
> +	if (n == 1) {
> +		/* We're handling a root commit - add all the paths. */
[...]
> +	if ((entry[0].sha1 && S_ISDIR(entry[0].mode)) ||
> +	    (entry[1].sha1 && S_ISDIR(entry[1].mode))) {

At this point I'm not sure what two trees are being traversed in
parallel, so it's not obvious to me what the code's about.  Are
they the "before" and "after" trees for commits being rebased?

[...]
> +static int collect_touched_paths(struct commit *commit, struct patch_ids *ids,
> +		int searching)
> +{
> +	struct tree_desc trees[2];
> +	struct collect_paths_info info = { &ids->touched_paths, searching };
> +	void *commitbuf;
> +	int result;
> +
> +	commitbuf = fill_tree_descriptor(trees + 1, commit->object.sha1);
> +	if (commit->parents) {
> +		void *parentbuf = fill_tree_descriptor(trees + 0,
> +					commit->parents->item->object.sha1);

Looks like yes.

What should happen for commits with more than 1 parent?  If they're
supposed to not enter this codepath because of a previous check, a
die("BUG: ...") could be a good way to make reading easier.

[...]
> @@ -40,6 +172,7 @@ int init_patch_ids(struct patch_ids *ids)
>  	diff_setup(&ids->diffopts);
>  	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
>  	diff_setup_done(&ids->diffopts);
> +	ids->touched_paths.strdup_strings = 1;

Good.

[...]
> @@ -64,6 +199,13 @@ static struct patch_id *add_commit(struct commit *commit,
>  	unsigned char sha1[20];
>  	int pos;
>  
> +	if (no_add) {
> +		if (collect_touched_paths(commit, ids, 1) < 0)
> +			return NULL;

Ah, so this is what the "searching" does.

The existing "no_add" argument is very confusing (what does it mean to
add a commit without adding?), but at least the confusion is
self-contained in a small, simple set of functions:

	static struct patch_id *add_commit(struct commit *commit,
					   struct patch_ids *ids,
					   int no_add)
	{
		...
	}

	struct patch_id *has_commit_patch_id(struct commit *commit,
					     struct patch_ids *ids)
	{
		return add_commit(commit, ids, 1);
	}

	struct patch_id *add_commit_patch_id(struct commit *commit,
					     struct patch_ids *ids)
	{
		return add_commit(commit, ids, 0);
	}

In fact the "no_add" makes *some* sense because add_commit() needs
to search for a patch-id before adding it, so calling with no_add
means to do all the work except for actually adding.

This new call propagates the mysterious boolean argument beyond
there and uses it in different ways.  Would it make sense to do
the following?

 (1) rename the "no_add" parameter to "search_instead" or something
 (2) avoid passing on "no_add" as a boolean.  Instead, use the
     same trick elsewhere so this can use appropriately named
     functions like:

	if (!searching)
		collect_touched_paths(commit, ids);
	else if (!only_touches_touched_paths(commit, ids))
		return NULL;

Hope that helps,
Jonathan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-19 19:36 ` Jonathan Nieder
@ 2013-05-19 22:02   ` John Keeping
  2013-05-20  6:36     ` Jonathan Nieder
  2013-05-20  4:46   ` Junio C Hamano
  1 sibling, 1 reply; 13+ messages in thread
From: John Keeping @ 2013-05-19 22:02 UTC (permalink / raw)
  To: Jonathan Nieder; +Cc: git, Kevin Bracey, Junio C Hamano

On Sun, May 19, 2013 at 12:36:12PM -0700, Jonathan Nieder wrote:
> John Keeping wrote:
> 
> > In this case, it is likely that only a small number of paths are touched
> > by the commits on the smaller side of the range and by storing these we
> > can ignore many commits on the other side before unpacking blobs and
> > diffing them.
> 
> I like this idea a lot.
> 
> > --- a/patch-ids.c
> > +++ b/patch-ids.c
> > @@ -1,5 +1,6 @@
> >  #include "cache.h"
> >  #include "diff.h"
> > +#include "diffcore.h"
> >  #include "commit.h"
> >  #include "sha1-lookup.h"
> >  #include "patch-ids.h"
> > @@ -16,6 +17,137 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
> >  	return diff_flush_patch_id(options, sha1);
> >  }
> >  
> > +struct collect_paths_info {
> > +	struct string_list *paths;
> > +	int searching;
> > +};
> > +
> > +static int collect_paths_recursive(int n, struct tree_desc *t,
> > +	const char *base, struct pathspec *pathspec,
> > +	struct collect_paths_info *data);
> 
> Can you say a little about how this function works (e.g., in a
> comment)?  What are the preconditions and postconditions?  How does
> the state evolve (e.g, when is "searching" set)?

As you figure out below, the patch ID API has two modes, "compute and
store" and "have we seen before".  The "searching" flag passes this
information down as we recursively compare the before and after trees
for a commit.  So when we hit a file that is changed by the commit we
either add it to the list of touched paths or tests if it is in the list
and stops comparing the trees if it isn't.

> > +
> > +static int same_entry(struct name_entry *a, struct name_entry *b)
> > +{
> > +	if (!a->sha1 || !b->sha1)
> > +		return a->sha1 == b->sha1;
> > +	return	!hashcmp(a->sha1, b->sha1) &&
> > +		a->mode == b->mode;
> 
> Style: unusual whitespace (the tab after "return").  I'd just do
> 
> 	if (!a->sha1 || ...)
> 		return ...
> 	return !hashcmp(a->sha1, b->sha1) && a->mode == b->mode;
> 
> since it is not too long.

Makes sense, I copied from the function of the same name in
builtin/merge-tree.c, although I then changed the semantics for
non-existent entries.

> [...]
> > +static char *traverse_path(const struct traverse_info *info,
> > +		const struct name_entry *n)
> > +{
> > +	char *path = xmalloc(traverse_path_len(info, n) + 1);
> > +	return make_traverse_path(path, info, n);
> > +}
> 
> This function seems to be the same as one of the same name in
> builtin/merge-tree.c.  Should it be put somewhere more public, for
> example as part of the tree-walk API?  Who is responsible for freeing
> "path"?  Would it make sense to use a strbuf here?
> 
> 	strbuf_setlen(&buf, traverse_path_len(info, n));
> 	make_traverse_path(&buf.buf, info, n);

Perhaps alloc_traverse_path?  I'm not sure the strbuf buys us much for
this case, since we only use the result for a few lines before freeing
it.

> > +
> > +static int add_touched_path(struct collect_paths_info *info, const char *path)
> > +{
> > +	if (info->searching) {
> > +		if (!string_list_has_string(info->paths, path))
> > +			return -1;
> > +	}
> > +	string_list_insert(info->paths, path);
> > +	return 0;
> > +}
> 
> Same question about internal API: when I see
> 
> 	add_touched_path(info, path)
> 
> what should I expect it to do?

Yeah, this patch suffers a lot from writing separate "collect" and
"check" codepaths and refactoring out common functionality later.

Perhaps all of the collect_* functions should be changed_paths_* and
then this is "process_changed_path".

> In the info->searching case, "string_list_insert(info->paths, path)"
> will always be a no-op, right?  What does it mean that this is adding
> a touched path in that case?
> 
> [...]
> > +static int collect_touched_paths_cb(int n, unsigned long mask,
> > +		unsigned long dirmask, struct name_entry *entry,
> > +		struct traverse_info *info)
> > +{
> 
> Same question about contracts.  Ideally a reader in a rush should be
> able to read an opening comment about what the function does and
> then return to the caller without delving into the details of how
> it does its work.
> 
> > +	struct collect_paths_info *collect_info = info->data;
> > +	if (n == 1) {
> > +		/* We're handling a root commit - add all the paths. */
> [...]
> > +	if ((entry[0].sha1 && S_ISDIR(entry[0].mode)) ||
> > +	    (entry[1].sha1 && S_ISDIR(entry[1].mode))) {
> 
> At this point I'm not sure what two trees are being traversed in
> parallel, so it's not obvious to me what the code's about.  Are
> they the "before" and "after" trees for commits being rebased?
> 
> [...]
> > +static int collect_touched_paths(struct commit *commit, struct patch_ids *ids,
> > +		int searching)
> > +{
> > +	struct tree_desc trees[2];
> > +	struct collect_paths_info info = { &ids->touched_paths, searching };
> > +	void *commitbuf;
> > +	int result;
> > +
> > +	commitbuf = fill_tree_descriptor(trees + 1, commit->object.sha1);
> > +	if (commit->parents) {
> > +		void *parentbuf = fill_tree_descriptor(trees + 0,
> > +					commit->parents->item->object.sha1);
> 
> Looks like yes.
> 
> What should happen for commits with more than 1 parent?  If they're
> supposed to not enter this codepath because of a previous check, a
> die("BUG: ...") could be a good way to make reading easier.

Currently the patch ID code compares the commit with its first parent,
so I think this should do the same.  I posted about this in relation to
revision.c earlier [1].

[1] http://article.gmane.org/gmane.comp.version-control.git/224884

> [...]
> > @@ -40,6 +172,7 @@ int init_patch_ids(struct patch_ids *ids)
> >  	diff_setup(&ids->diffopts);
> >  	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
> >  	diff_setup_done(&ids->diffopts);
> > +	ids->touched_paths.strdup_strings = 1;
> 
> Good.
> 
> [...]
> > @@ -64,6 +199,13 @@ static struct patch_id *add_commit(struct commit *commit,
> >  	unsigned char sha1[20];
> >  	int pos;
> >  
> > +	if (no_add) {
> > +		if (collect_touched_paths(commit, ids, 1) < 0)
> > +			return NULL;
> 
> Ah, so this is what the "searching" does.
> 
> The existing "no_add" argument is very confusing (what does it mean to
> add a commit without adding?), but at least the confusion is
> self-contained in a small, simple set of functions:
> 
> 	static struct patch_id *add_commit(struct commit *commit,
> 					   struct patch_ids *ids,
> 					   int no_add)
> 	{
> 		...
> 	}
> 
> 	struct patch_id *has_commit_patch_id(struct commit *commit,
> 					     struct patch_ids *ids)
> 	{
> 		return add_commit(commit, ids, 1);
> 	}
> 
> 	struct patch_id *add_commit_patch_id(struct commit *commit,
> 					     struct patch_ids *ids)
> 	{
> 		return add_commit(commit, ids, 0);
> 	}
> 
> In fact the "no_add" makes *some* sense because add_commit() needs
> to search for a patch-id before adding it, so calling with no_add
> means to do all the work except for actually adding.
> 
> This new call propagates the mysterious boolean argument beyond
> there and uses it in different ways.  Would it make sense to do
> the following?
> 
>  (1) rename the "no_add" parameter to "search_instead" or something
>  (2) avoid passing on "no_add" as a boolean.  Instead, use the
>      same trick elsewhere so this can use appropriately named
>      functions like:
> 
> 	if (!searching)
> 		collect_touched_paths(commit, ids);
> 	else if (!only_touches_touched_paths(commit, ids))
> 		return NULL;

Seems reasonable.

Thanks for the review.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-19 19:36 ` Jonathan Nieder
  2013-05-19 22:02   ` John Keeping
@ 2013-05-20  4:46   ` Junio C Hamano
  1 sibling, 0 replies; 13+ messages in thread
From: Junio C Hamano @ 2013-05-20  4:46 UTC (permalink / raw)
  To: Jonathan Nieder; +Cc: John Keeping, git, Kevin Bracey

Jonathan Nieder <jrnieder@gmail.com> writes:

>> @@ -64,6 +199,13 @@ static struct patch_id *add_commit(struct commit *commit,
>>  	unsigned char sha1[20];
>>  	int pos;
>>  
>> +	if (no_add) {
>> +		if (collect_touched_paths(commit, ids, 1) < 0)
>> +			return NULL;
>
> Ah, so this is what the "searching" does.
>
> The existing "no_add" argument is very confusing (what does it mean to
> add a commit without adding?),

Such a mode of operation is usually called "dry-run", no?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-19 22:02   ` John Keeping
@ 2013-05-20  6:36     ` Jonathan Nieder
  2013-05-20  8:16       ` John Keeping
  0 siblings, 1 reply; 13+ messages in thread
From: Jonathan Nieder @ 2013-05-20  6:36 UTC (permalink / raw)
  To: John Keeping; +Cc: git, Kevin Bracey, Junio C Hamano

John Keeping wrote:
> On Sun, May 19, 2013 at 12:36:12PM -0700, Jonathan Nieder wrote:

>>                                        Who is responsible for freeing
>> "path"?  Would it make sense to use a strbuf here?
>>
>> 	strbuf_setlen(&buf, traverse_path_len(info, n));
>> 	make_traverse_path(&buf.buf, info, n);
>
> Perhaps alloc_traverse_path?  I'm not sure the strbuf buys us much for
> this case, since we only use the result for a few lines before freeing
> it.

Good idea.  Generally strbufs are nice for this kind of thing because
they make it easy to reuse buffers, but in this case there's no
opportunity for that.

[...]
> then this is "process_changed_path".

Sounds good.

[...]
>> What should happen for commits with more than 1 parent?  If they're
>> supposed to not enter this codepath because of a previous check, a
>> die("BUG: ...") could be a good way to make reading easier.
>
> Currently the patch ID code compares the commit with its first parent,
> so I think this should do the same.

Ok.  I guess a comment nearby would help future readers avoid the same
question.

I don't know what it should mean to try to use --cherry without
--no-merges or --first-parent, so I guess this is harmless.

[...]
> Thanks for the review.

No problem.  Thanks for a pleasant read.

Ciao,
Jonathan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-20  6:36     ` Jonathan Nieder
@ 2013-05-20  8:16       ` John Keeping
  0 siblings, 0 replies; 13+ messages in thread
From: John Keeping @ 2013-05-20  8:16 UTC (permalink / raw)
  To: Jonathan Nieder; +Cc: git, Kevin Bracey, Junio C Hamano

On Sun, May 19, 2013 at 11:36:23PM -0700, Jonathan Nieder wrote:
> I don't know what it should mean to try to use --cherry without
> --no-merges or --first-parent, so I guess this is harmless.

Currently --no-merges doesn't actually get passed down this far.  We do
the patch ID calculations without taking that into account.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-19 13:17 [RFC/PATCH] patch-ids: check modified paths before calculating diff John Keeping
  2013-05-19 19:36 ` Jonathan Nieder
@ 2013-05-29  6:20 ` Jeff King
  2013-05-29  7:22   ` Jeff King
  2013-05-29 18:08   ` Junio C Hamano
  1 sibling, 2 replies; 13+ messages in thread
From: Jeff King @ 2013-05-29  6:20 UTC (permalink / raw)
  To: John Keeping; +Cc: git, Kevin Bracey, Junio C Hamano

On Sun, May 19, 2013 at 02:17:35PM +0100, John Keeping wrote:

> When using "git cherry" or "git log --cherry-pick" we often have a small
> number of commits on one side and a large number on the other.  In
> revision.c::cherry_pick_list we store the patch IDs for the small side
> before comparing the large side to this.
> 
> In this case, it is likely that only a small number of paths are touched
> by the commits on the smaller side of the range and by storing these we
> can ignore many commits on the other side before unpacking blobs and
> diffing them.

There are two observations that make your scheme work:

  1. For something like --cherry-pick, we do not even care about the
     actual patch-ids, only whether there are matches from the left and
     right sides.

  2. Comparing the sets of paths touched by two commits is often
     sufficient to realize they do not have the same patch-ids.

But I think those observations mean we can do even better than
calculating the real patch-id for the small side at all.

Imagine we have both "loose" and "strict" patch-ids, where the loose one
is much faster to compute, and has that property that a match _might_
mean we have the same strict patch-id, but a non-match means we
definitely do not have the same strict patch-id.

I think such a loose patch-id could just be a hash of the filenames that
were changed by the patch (e.g., the first 32-bits of the sha1 of the
concatenated filenames). Computing that should be about as expensive as
a tree-diff. Per observation 2 above, if two commits do not have the
same loose id, we know that they cannot possibly have the same strict
id.

Then we can forget about the smaller-side and bigger-side entirely, and
just do something like:

  1. Make a sorted list (or hash table) of loose ids for one side.

  2. For each commit on the other side, calculate its loose id and look
     that up in the sorted list. If no hits, we know that there is no
     match. For any hits, lazily calculate (and cache) the strict patch
     id for both sides and compare as usual.

In the best case, we compute no patch-ids at all. And even for the
average case, I'd expect our lazy calculation to only have to compute a
handful of ids.

-Peff

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-29  6:20 ` Jeff King
@ 2013-05-29  7:22   ` Jeff King
  2013-05-29  7:50     ` Jeff King
  2013-05-29 18:08   ` Junio C Hamano
  1 sibling, 1 reply; 13+ messages in thread
From: Jeff King @ 2013-05-29  7:22 UTC (permalink / raw)
  To: John Keeping; +Cc: git, Kevin Bracey, Junio C Hamano

On Wed, May 29, 2013 at 02:20:07AM -0400, Jeff King wrote:

> In the best case, we compute no patch-ids at all. And even for the
> average case, I'd expect our lazy calculation to only have to compute a
> handful of ids.

Here is a not-well-tested version of the idea. I tried to contain the
changes to patch-ids.c, though we may also be able to simplify the
--cherry code, too.

Here are my timings compared to stock git and to yours
(all are best-of-five, "git log --cherry $revs"):

  revs=origin/master...origin/jk/submodule-subdirectory-ok
        stock    |    you   |    me
        -------------------------------
  real  0m0.501s | 0m0.078s | 0m0.098s
  user  0m0.480s | 0m0.056s | 0m0.084s
  sys   0m0.016s | 0m0.020s | 0m0.012s

  revs=origin/next...origin/pu
        stock    |    you   |    me
        -------------------------------
  real  0m0.857s | 0m0.847s | 0m0.519s
  user  0m0.828s | 0m0.812s | 0m0.488s
  sys   0m0.024s | 0m0.028s | 0m0.024s

So it performs slightly less well on the small case, and a bit better on
the large case. I think part of the problem is that when we do have a
"loose" hit, we end up re-doing the tree diff to find the strict, which
involves re-inflating the trees. It's unavoidable for the lazy entries
unless we want to cache the whole diff_queued_diff, but for the non-lazy
entries we should be able to do the strict patch-id incrementally. We
just need to improve the function interfaces.

---
 diff.c      |  15 ++++-
 diff.h      |   2 +-
 patch-ids.c | 117 +++++++++++++++++++----------------
 patch-ids.h |  11 ++--
 4 files changed, 82 insertions(+), 63 deletions(-)

diff --git a/diff.c b/diff.c
index f0b3e7c..3b55788 100644
--- a/diff.c
+++ b/diff.c
@@ -4233,7 +4233,8 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
+static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1,
+			     int loose)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4266,6 +4267,12 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 		if (DIFF_PAIR_UNMERGED(p))
 			continue;
 
+		if (loose) {
+			git_SHA1_Update(&ctx, p->one->path, strlen(p->one->path));
+			git_SHA1_Update(&ctx, p->two->path, strlen(p->two->path));
+			continue;
+		}
+
 		diff_fill_sha1_info(p->one);
 		diff_fill_sha1_info(p->two);
 		if (fill_mmfile(&mf1, p->one) < 0 ||
@@ -4323,11 +4330,13 @@ int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
+int diff_flush_patch_id(struct diff_options *options,
+			unsigned char *sha1,
+			int loose)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1);
+	int result = diff_get_patch_id(options, sha1, loose);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
diff --git a/diff.h b/diff.h
index 78b4091..b018aaf 100644
--- a/diff.h
+++ b/diff.h
@@ -320,7 +320,7 @@ extern int do_diff_cache(const unsigned char *, struct diff_options *);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *);
+extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int loose);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index bc8a28f..3a83ee6 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -5,7 +5,7 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 #include "patch-ids.h"
 
 static int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1)
+			   unsigned char *sha1, int loose)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
@@ -13,27 +13,9 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	return diff_flush_patch_id(options, sha1, loose);
 }
 
-static const unsigned char *patch_id_access(size_t index, void *table)
-{
-	struct patch_id **id_table = table;
-	return id_table[index]->patch_id;
-}
-
-static int patch_pos(struct patch_id **table, int nr, const unsigned char *id)
-{
-	return sha1_pos(id, table, nr, patch_id_access);
-}
-
-#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
-struct patch_id_bucket {
-	struct patch_id_bucket *next;
-	int nr;
-	struct patch_id bucket[BUCKET_SIZE];
-};
-
 int init_patch_ids(struct patch_ids *ids)
 {
 	memset(ids, 0, sizeof(*ids));
@@ -43,56 +25,83 @@ static struct patch_id *add_commit(struct commit *commit,
 	return 0;
 }
 
+static int free_each_patch_id(void *patch_id, void *data)
+{
+	free(patch_id);
+	return 0;
+}
+
 int free_patch_ids(struct patch_ids *ids)
 {
-	struct patch_id_bucket *next, *patches;
-
-	free(ids->table);
-	for (patches = ids->patches; patches; patches = next) {
-		next = patches->next;
-		free(patches);
-	}
+	for_each_hash(&ids->table, free_each_patch_id, NULL);
+	free_hash(&ids->table);
 	return 0;
 }
 
+static inline unsigned long hash_sha1(const unsigned char *sha1)
+{
+	unsigned int hash;
+	memcpy(&hash, sha1, sizeof(hash));
+	return hash;
+}
+
 static struct patch_id *add_commit(struct commit *commit,
 				   struct patch_ids *ids,
 				   int no_add)
 {
-	struct patch_id_bucket *bucket;
-	struct patch_id *ent;
-	unsigned char sha1[20];
-	int pos;
+	struct patch_id *p;
+	unsigned char loose[20], strict[20];
+	unsigned long hash;
+	void **pos;
 
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
+	if (commit_patch_id(commit, &ids->diffopts, loose, 1))
 		return NULL;
-	pos = patch_pos(ids->table, ids->nr, sha1);
-	if (0 <= pos)
-		return ids->table[pos];
+	hash = hash_sha1(loose);
+
+	p = lookup_hash(hash, &ids->table);
+	for (; p; p = p->next) {
+		/* Skip collisions from our reduced hash. */
+		if (hashcmp(loose, p->loose))
+		    continue;
+
+		/*
+		 * We have a real loose match; lazily load and cache the strict
+		 * id to compare.
+		 */
+		if (is_null_sha1(p->strict)) {
+			if (commit_patch_id(p->commit, &ids->diffopts,
+					    p->strict, 0))
+				return NULL;
+		}
+		if (commit_patch_id(commit, &ids->diffopts, strict, 0))
+			return NULL;
+
+		/*
+		 * If the strict ones match, we do not need to look farther;
+		 * the patch-id is here.
+		 */
+		if (!hashcmp(strict, p->strict))
+			return p;
+	}
+
+	/*
+	 * Otherwise, we may have a loose but not strict match, or even no
+	 * loose match at all. Now we can add the new entry (or return failure
+	 * to look it up).
+	 */
 	if (no_add)
 		return NULL;
 
-	pos = -1 - pos;
+	p = xcalloc(1, sizeof(*p));
+	p->commit = commit;
+	hashcpy(p->loose, loose);
 
-	bucket = ids->patches;
-	if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
-		bucket = xcalloc(1, sizeof(*bucket));
-		bucket->next = ids->patches;
-		ids->patches = bucket;
+	pos = insert_hash(hash, p, &ids->table);
+	if (pos) {
+		p->next = *pos;
+		*pos = p;
 	}
-	ent = &bucket->bucket[bucket->nr++];
-	hashcpy(ent->patch_id, sha1);
-
-	if (ids->alloc <= ids->nr) {
-		ids->alloc = alloc_nr(ids->nr);
-		ids->table = xrealloc(ids->table, sizeof(ent) * ids->alloc);
-	}
-	if (pos < ids->nr)
-		memmove(ids->table + pos + 1, ids->table + pos,
-			sizeof(ent) * (ids->nr - pos));
-	ids->nr++;
-	ids->table[pos] = ent;
-	return ids->table[pos];
+	return p;
 }
 
 struct patch_id *has_commit_patch_id(struct commit *commit,
diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c40ff39 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -2,15 +2,16 @@ struct patch_ids {
 #define PATCH_IDS_H
 
 struct patch_id {
-	unsigned char patch_id[20];
-	char seen;
+	struct commit *commit;
+	unsigned char loose[20];
+	unsigned char strict[20];
+	unsigned seen:1;
+	struct patch_id *next;
 };
 
 struct patch_ids {
 	struct diff_options diffopts;
-	int nr, alloc;
-	struct patch_id **table;
-	struct patch_id_bucket *patches;
+	struct hash_table table;
 };
 
 int init_patch_ids(struct patch_ids *);

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-29  7:22   ` Jeff King
@ 2013-05-29  7:50     ` Jeff King
  0 siblings, 0 replies; 13+ messages in thread
From: Jeff King @ 2013-05-29  7:50 UTC (permalink / raw)
  To: John Keeping; +Cc: git, Kevin Bracey, Junio C Hamano

On Wed, May 29, 2013 at 03:22:25AM -0400, Jeff King wrote:

>   revs=origin/master...origin/jk/submodule-subdirectory-ok
>         stock    |    you   |    me
>         -------------------------------
>   real  0m0.501s | 0m0.078s | 0m0.098s
>   user  0m0.480s | 0m0.056s | 0m0.084s
>   sys   0m0.016s | 0m0.020s | 0m0.012s
> 
>   revs=origin/next...origin/pu
>         stock    |    you   |    me
>         -------------------------------
>   real  0m0.857s | 0m0.847s | 0m0.519s
>   user  0m0.828s | 0m0.812s | 0m0.488s
>   sys   0m0.024s | 0m0.028s | 0m0.024s
> 
> So it performs slightly less well on the small case, and a bit better on
> the large case. I think part of the problem is that when we do have a
> "loose" hit, we end up re-doing the tree diff to find the strict, which
> involves re-inflating the trees. It's unavoidable for the lazy entries
> unless we want to cache the whole diff_queued_diff, but for the non-lazy
> entries we should be able to do the strict patch-id incrementally. We
> just need to improve the function interfaces.

The (somewhat hacky) patch below, on top of my previous one, does that
optimization.  But the timings aren't improved by much. My best-of-five
for the second case went down to:

  real    0m0.495s
  user    0m0.488s
  sys     0m0.004s

However, the actual time to just do "git log --raw $revs" is:

  real    0m0.333s
  user    0m0.292s
  sys     0m0.032s

which provides a lower-bound for how well we can do, as it is just doing
a single tree diff for each commit. So I think we have reaped most of
the benefits of this approach already (and we will generally have to do
_some_ true patch-id calculations, so we can never meet that lower
bound).

---
 diff.c      | 11 +++-------
 diff.h      |  3 ++-
 patch-ids.c | 39 ++++++++++++++++++++++++++---------
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/diff.c b/diff.c
index 3b55788..161c5bf 100644
--- a/diff.c
+++ b/diff.c
@@ -4233,8 +4233,8 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1,
-			     int loose)
+int diff_get_patch_id(struct diff_options *options, unsigned char *sha1,
+		      int loose)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4330,21 +4330,16 @@ int diff_flush_patch_id(struct diff_options *options,
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options,
-			unsigned char *sha1,
-			int loose)
+void diff_queue_clear(void)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1, loose);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
 
 	free(q->queue);
 	DIFF_QUEUE_CLEAR(q);
-
-	return result;
 }
 
 static int is_summary_empty(const struct diff_queue_struct *q)
diff --git a/diff.h b/diff.h
index b018aaf..7207d4b 100644
--- a/diff.h
+++ b/diff.h
@@ -320,7 +320,8 @@ extern int do_diff_cache(const unsigned char *, struct diff_options *);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int loose);
+extern int diff_get_patch_id(struct diff_options *, unsigned char *, int loose);
+extern void diff_queue_clear(void);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index 3a83ee6..83cda92 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -4,8 +4,7 @@
 #include "sha1-lookup.h"
 #include "patch-ids.h"
 
-static int commit_patch_id(struct commit *commit, struct diff_options *options,
-			   unsigned char *sha1, int loose)
+static void start_patch_id(struct commit *commit, struct diff_options *options)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
@@ -13,7 +12,6 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1, loose);
 }
 
 int init_patch_ids(struct patch_ids *ids)
@@ -50,12 +48,16 @@ static struct patch_id *add_commit(struct commit *commit,
 				   int no_add)
 {
 	struct patch_id *p;
-	unsigned char loose[20], strict[20];
+	unsigned char loose[20], strict[20] = {0};
 	unsigned long hash;
 	void **pos;
 
-	if (commit_patch_id(commit, &ids->diffopts, loose, 1))
+	start_patch_id(commit, &ids->diffopts);
+	if (diff_get_patch_id(&ids->diffopts, loose, 1)) {
+		diff_queue_clear();
 		return NULL;
+	}
+
 	hash = hash_sha1(loose);
 
 	p = lookup_hash(hash, &ids->table);
@@ -66,15 +68,24 @@ static struct patch_id *add_commit(struct commit *commit,
 
 		/*
 		 * We have a real loose match; lazily load and cache the strict
-		 * id to compare.
+		 * id to compare. We must do the "current" commit first, as its
+		 * incremental state is waiting in the diff machinery.
 		 */
+		if (is_null_sha1(strict)) {
+			int result = diff_get_patch_id(&ids->diffopts, strict, 0);
+			diff_queue_clear();
+			if (result)
+				return NULL;
+		}
+
 		if (is_null_sha1(p->strict)) {
-			if (commit_patch_id(p->commit, &ids->diffopts,
-					    p->strict, 0))
+			int result;
+			start_patch_id(p->commit, &ids->diffopts);
+			result = diff_get_patch_id(&ids->diffopts, p->strict, 0);
+			diff_queue_clear();
+			if (result)
 				return NULL;
 		}
-		if (commit_patch_id(commit, &ids->diffopts, strict, 0))
-			return NULL;
 
 		/*
 		 * If the strict ones match, we do not need to look farther;
@@ -85,6 +96,14 @@ static struct patch_id *add_commit(struct commit *commit,
 	}
 
 	/*
+	 * If we get here and have not filled in strict, then we do not need
+	 * to compute it (for now), but we must clean up the leftover diff
+	 * state.
+	 */
+	if (is_null_sha1(strict))
+		diff_queue_clear();
+
+	/*
 	 * Otherwise, we may have a loose but not strict match, or even no
 	 * loose match at all. Now we can add the new entry (or return failure
 	 * to look it up).

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-29  6:20 ` Jeff King
  2013-05-29  7:22   ` Jeff King
@ 2013-05-29 18:08   ` Junio C Hamano
  2013-05-29 18:36     ` Jeff King
  1 sibling, 1 reply; 13+ messages in thread
From: Junio C Hamano @ 2013-05-29 18:08 UTC (permalink / raw)
  To: Jeff King; +Cc: John Keeping, git, Kevin Bracey

Jeff King <peff@peff.net> writes:

> I think such a loose patch-id could just be a hash of the filenames that
> were changed by the patch (e.g., the first 32-bits of the sha1 of the
> concatenated filenames). Computing that should be about as expensive as
> a tree-diff. Per observation 2 above, if two commits do not have the
> same loose id, we know that they cannot possibly have the same strict
> id.

Because the "strict" one already hashes the filenames, if files that
are touched by a patch is different from that of another patch, we
judge them being different.

> Then we can forget about the smaller-side and bigger-side entirely, and
> just do something like:
>
>   1. Make a sorted list (or hash table) of loose ids for one side.
>
>   2. For each commit on the other side, calculate its loose id and look
>      that up in the sorted list. If no hits, we know that there is no
>      match. For any hits, lazily calculate (and cache) the strict patch
>      id for both sides and compare as usual.
>
> In the best case, we compute no patch-ids at all. And even for the
> average case, I'd expect our lazy calculation to only have to compute a
> handful of ids.

Correct.

This has rather interesting ramifications on cherry-pick and rebase,
though.  Both command can handle changes that come from an old tree
before some paths were renamed, but strict patch-id would not spot
equivalent changes we already have in our history if our change
happened after a rename, i.e.

   Z
  /
 O---R---X---Y

where Z updates path F, R moves F to G and X changes G the same way
as Z changes F, and we are trying to cherry-pick Z on top of Y.  The
cherry-pick filter will see different patch-id for Z and X.

We will likely to notice that "patch already applied" (if using am-3
machinery) or "already up-to-date" (if using merge machinery) even
when we missed this equivalency and drop the duplicate from the
result, so it is not a big loss, but we might want to consider
removing the filename from patch-id computation, at least for the
ones we internally use and discard for revs->cherry_pick filtering.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-29 18:08   ` Junio C Hamano
@ 2013-05-29 18:36     ` Jeff King
  2013-05-29 18:48       ` Junio C Hamano
  0 siblings, 1 reply; 13+ messages in thread
From: Jeff King @ 2013-05-29 18:36 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: John Keeping, git, Kevin Bracey

On Wed, May 29, 2013 at 11:08:46AM -0700, Junio C Hamano wrote:

> This has rather interesting ramifications on cherry-pick and rebase,
> though.  Both command can handle changes that come from an old tree
> before some paths were renamed, but strict patch-id would not spot
> equivalent changes we already have in our history if our change
> happened after a rename, i.e.
> 
>    Z
>   /
>  O---R---X---Y
> 
> where Z updates path F, R moves F to G and X changes G the same way
> as Z changes F, and we are trying to cherry-pick Z on top of Y.  The
> cherry-pick filter will see different patch-id for Z and X.

True. That is a problem with the current patch-id system, no? Though my
proposal would make it harder to change it in the future (as does
John's).

With mine, I wonder if you could use a different "loose" definition that
does not take the filenames into account.  Using something like the
changes in file sizes would be unique, but would not properly map to
strict cases (a patch-id that adds 5 lines to the end of a 100-byte file
may match one that adds the same five lines to the end of a 200-byte
file).

-Peff

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-29 18:36     ` Jeff King
@ 2013-05-29 18:48       ` Junio C Hamano
  2013-05-29 21:30         ` John Keeping
  0 siblings, 1 reply; 13+ messages in thread
From: Junio C Hamano @ 2013-05-29 18:48 UTC (permalink / raw)
  To: Jeff King; +Cc: John Keeping, git, Kevin Bracey

Jeff King <peff@peff.net> writes:

> On Wed, May 29, 2013 at 11:08:46AM -0700, Junio C Hamano wrote:
>
>> This has rather interesting ramifications on cherry-pick and rebase,
>> though.  Both command can handle changes that come from an old tree
>> before some paths were renamed, but strict patch-id would not spot
>> equivalent changes we already have in our history if our change
>> happened after a rename, i.e.
>> 
>>    Z
>>   /
>>  O---R---X---Y
>> 
>> where Z updates path F, R moves F to G and X changes G the same way
>> as Z changes F, and we are trying to cherry-pick Z on top of Y.  The
>> cherry-pick filter will see different patch-id for Z and X.
>
> True. That is a problem with the current patch-id system, no?

Correct.  That is why I suggested not to change the external
interface (i.e. what is shown from patch-id command) as people may
have kept them, but wondered if a possible improvement may be to
exclude the name from ids when we internally generate two sets of
Ids and intersect them, i.e. log --cherry.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
  2013-05-29 18:48       ` Junio C Hamano
@ 2013-05-29 21:30         ` John Keeping
  0 siblings, 0 replies; 13+ messages in thread
From: John Keeping @ 2013-05-29 21:30 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Jeff King, git, Kevin Bracey

On Wed, May 29, 2013 at 11:48:53AM -0700, Junio C Hamano wrote:
> Jeff King <peff@peff.net> writes:
> 
> > On Wed, May 29, 2013 at 11:08:46AM -0700, Junio C Hamano wrote:
> >
> >> This has rather interesting ramifications on cherry-pick and rebase,
> >> though.  Both command can handle changes that come from an old tree
> >> before some paths were renamed, but strict patch-id would not spot
> >> equivalent changes we already have in our history if our change
> >> happened after a rename, i.e.
> >> 
> >>    Z
> >>   /
> >>  O---R---X---Y
> >> 
> >> where Z updates path F, R moves F to G and X changes G the same way
> >> as Z changes F, and we are trying to cherry-pick Z on top of Y.  The
> >> cherry-pick filter will see different patch-id for Z and X.
> >
> > True. That is a problem with the current patch-id system, no?
> 
> Correct.  That is why I suggested not to change the external
> interface (i.e. what is shown from patch-id command) as people may
> have kept them, but wondered if a possible improvement may be to
> exclude the name from ids when we internally generate two sets of
> Ids and intersect them, i.e. log --cherry.

Would that not rely on the files still being in the same order?  Since
we essentially hash the content of the patch, it will presumably be
quite different if the order of the files in the patch changes.

I expect it's possible to work around that by being a bit more clever
though.  Perhaps hash each file individually and then sort those and
hash the concatenated result?

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2013-05-29 21:30 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-19 13:17 [RFC/PATCH] patch-ids: check modified paths before calculating diff John Keeping
2013-05-19 19:36 ` Jonathan Nieder
2013-05-19 22:02   ` John Keeping
2013-05-20  6:36     ` Jonathan Nieder
2013-05-20  8:16       ` John Keeping
2013-05-20  4:46   ` Junio C Hamano
2013-05-29  6:20 ` Jeff King
2013-05-29  7:22   ` Jeff King
2013-05-29  7:50     ` Jeff King
2013-05-29 18:08   ` Junio C Hamano
2013-05-29 18:36     ` Jeff King
2013-05-29 18:48       ` Junio C Hamano
2013-05-29 21:30         ` John Keeping

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.