All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] patch-ids.c: cache patch IDs in a notes tree
@ 2013-05-11 19:54 John Keeping
  2013-05-11 21:10 ` Linus Torvalds
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-11 19:54 UTC (permalink / raw)
  To: git; +Cc: Johannes Schindelin, Junio C Hamano, John Keeping

This adds a new configuration variable "patchid.cacheRef" which controls
whether (and where) patch IDs will be cached in a notes tree.

Caching patch IDs in this way results in a performance improvement in
every case I have tried, for example when comparing the git-gui tree to
git.git (where git-gui has been merged into git.git but most git.git
commits do not appear in git-gui):

Without patch ID caching:
$ time git log --cherry master...git-gui/master >/dev/null
real    0m32.981s
user    0m32.364s
sys     0m0.621s

Prime the cache:
$ time git -c patchid.cacheref=patchids log --cherry \
	master...git-gui/master >/dev/null
real    0m33.860s
user    0m32.832s
sys     0m0.986s

With all patch IDs cached:
$ time git -c patchid.cacheref=patchids log --cherry \
	master...git-gui/master >/dev/null
real    0m1.041s
user    0m0.679s
sys     0m0.363s

Signed-off-by: John Keeping <john@keeping.me.uk>
---
This is another approach to fixing the "log --cherry" takes a long time
issue I encountered comparing commits built on git-gui to those in
git.git [1][2].

I think this is a useful feature even outside that use case.  I measured
a small improvement (when the cache is primed) even comparing two
branches with 1 and 2 different commits respectively.

[1] http://article.gmane.org/gmane.comp.version-control.git/223959
[2] http://article.gmane.org/gmane.comp.version-control.git/223956

 Documentation/config.txt             |  7 +++
 patch-ids.c                          | 91 +++++++++++++++++++++++++++++++++++-
 patch-ids.h                          |  1 +
 t/t6007-rev-list-cherry-pick-file.sh | 16 +++++++
 4 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 6e53fc5..e36585c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1798,6 +1798,13 @@ pager.<cmd>::
 	precedence over this option.  To disable pagination for all
 	commands, set `core.pager` or `GIT_PAGER` to `cat`.
 
+patchid.cacheRef::
+	The name of a notes ref in which to store patch IDs for commits.
+	The ref is taken to be in `refs/notes/` if it is not qualified.
++
+Note that the patch ID notes are never pruned automatically, so you may
+wish to periodically run `git notes --ref <ref> prune` against this ref.
+
 pretty.<name>::
 	Alias for a --pretty= format string, as specified in
 	linkgit:git-log[1]. Any aliases defined here can be used just
diff --git a/patch-ids.c b/patch-ids.c
index bc8a28f..cb05eec 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -1,6 +1,9 @@
 #include "cache.h"
+#include "blob.h"
 #include "diff.h"
 #include "commit.h"
+#include "notes.h"
+#include "refs.h"
 #include "sha1-lookup.h"
 #include "patch-ids.h"
 
@@ -34,12 +37,38 @@ struct patch_id_bucket {
 	struct patch_id bucket[BUCKET_SIZE];
 };
 
+static int patch_id_config(const char *var, const char *value, void *cb)
+{
+	const char **cacheref = cb;
+
+	if (!strcmp(var, "patchid.cacheref"))
+		return git_config_string(cacheref, var, value);
+
+	return 0;
+}
+
 int init_patch_ids(struct patch_ids *ids)
 {
+	char *cacheref = NULL;
+
 	memset(ids, 0, sizeof(*ids));
 	diff_setup(&ids->diffopts);
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	diff_setup_done(&ids->diffopts);
+
+	git_config(patch_id_config, &cacheref);
+	if (cacheref) {
+		struct strbuf sb = STRBUF_INIT;
+		strbuf_addstr(&sb, cacheref);
+		expand_notes_ref(&sb);
+
+		ids->cache = xcalloc(1, sizeof(*ids->cache));
+		init_notes(ids->cache, sb.buf, combine_notes_overwrite, 0);
+
+		strbuf_release(&sb);
+		free(cacheref);
+	}
+
 	return 0;
 }
 
@@ -52,9 +81,67 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	if (ids->cache) {
+		unsigned char notes_sha1[20];
+		if (write_notes_tree(ids->cache, notes_sha1) ||
+		    update_ref("patch-id: update cache", ids->cache->ref,
+				notes_sha1, NULL, 0, QUIET_ON_ERR))
+			error(_("failed to write patch ID cache"));
+
+		free_notes(ids->cache);
+		ids->cache = NULL;
+	}
+
 	return 0;
 }
 
+static int load_cached_patch_id(struct commit *commit,
+		struct notes_tree *cache, unsigned char *sha1)
+{
+	const unsigned char *note_sha1;
+	char *note;
+	enum object_type type;
+	unsigned long notelen;
+	int result = 1;
+
+	if (!cache)
+		return 1;
+
+	note_sha1 = get_note(cache, commit->object.sha1);
+	if (!note_sha1)
+		return 1;
+
+	if (!(note = read_sha1_file(note_sha1, &type, &notelen)) || !notelen ||
+			type != OBJ_BLOB)
+		goto out;
+
+	if (get_sha1_hex(note, sha1))
+		goto out;
+
+	result = 0;
+out:
+	free(note);
+	return result;
+}
+
+static void save_cached_patch_id(struct commit *commit,
+		struct notes_tree *cache, unsigned char *sha1)
+{
+	unsigned char note_sha1[20];
+	struct strbuf sb = STRBUF_INIT;
+	if (!cache)
+		return;
+
+	strbuf_addstr(&sb, sha1_to_hex(sha1));
+	strbuf_addch(&sb, '\n');
+	if (write_sha1_file(sb.buf, sb.len, blob_type, note_sha1) ||
+	    add_note(cache, commit->object.sha1, note_sha1, NULL))
+		error(_("unable to save patch ID in cache"));
+
+	strbuf_release(&sb);
+}
+
 static struct patch_id *add_commit(struct commit *commit,
 				   struct patch_ids *ids,
 				   int no_add)
@@ -64,11 +151,13 @@ static struct patch_id *add_commit(struct commit *commit,
 	unsigned char sha1[20];
 	int pos;
 
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
+	if (load_cached_patch_id(commit, ids->cache, sha1) &&
+	    commit_patch_id(commit, &ids->diffopts, sha1))
 		return NULL;
 	pos = patch_pos(ids->table, ids->nr, sha1);
 	if (0 <= pos)
 		return ids->table[pos];
+	save_cached_patch_id(commit, ids->cache, sha1);
 	if (no_add)
 		return NULL;
 
diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..cffb0eb 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -8,6 +8,7 @@ struct patch_id {
 
 struct patch_ids {
 	struct diff_options diffopts;
+	struct notes_tree *cache;
 	int nr, alloc;
 	struct patch_id **table;
 	struct patch_id_bucket *patches;
diff --git a/t/t6007-rev-list-cherry-pick-file.sh b/t/t6007-rev-list-cherry-pick-file.sh
index 28d4f6b..1b112c3 100755
--- a/t/t6007-rev-list-cherry-pick-file.sh
+++ b/t/t6007-rev-list-cherry-pick-file.sh
@@ -207,4 +207,20 @@ test_expect_success '--count --left-right' '
 	test_cmp expect actual
 '
 
+cat >expect <<EOF
+1	1	2
+EOF
+
+test_expect_success 'rev-list --cherry-mark caches patch ids' '
+	test_config patchid.cacheref patchids &&
+	git rev-list --cherry-mark --left-right --count F...E -- bar >actual &&
+	test_cmp expect actual &&
+	git notes --ref patchids show master >cached_master &&
+	git log -p -1 master | git patch-id | sed -e "s/ .*//" >patch-id_master &&
+	test_cmp patch-id_master cached_master &&
+	# Get the patch IDs again, now they should be cached
+	git rev-list --cherry-mark --left-right --count F...E -- bar >actual &&
+	test_cmp expect actual
+'
+
 test_done
-- 
1.8.3.rc1.289.gcb3647f

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-11 19:54 [PATCH] patch-ids.c: cache patch IDs in a notes tree John Keeping
@ 2013-05-11 21:10 ` Linus Torvalds
  2013-05-11 21:49   ` John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: Linus Torvalds @ 2013-05-11 21:10 UTC (permalink / raw)
  To: John Keeping; +Cc: Git Mailing List, Johannes Schindelin, Junio C Hamano

On Sat, May 11, 2013 at 12:54 PM, John Keeping <john@keeping.me.uk> wrote:
> This adds a new configuration variable "patchid.cacheRef" which controls
> whether (and where) patch IDs will be cached in a notes tree.

Patch ID's aren't stable wrt different diff options, so I think this
is potentially a very bad idea.

           Linus

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-11 21:10 ` Linus Torvalds
@ 2013-05-11 21:49   ` John Keeping
  2013-05-11 22:41     ` Linus Torvalds
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-11 21:49 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Git Mailing List, Johannes Schindelin, Junio C Hamano

On Sat, May 11, 2013 at 02:10:01PM -0700, Linus Torvalds wrote:
> On Sat, May 11, 2013 at 12:54 PM, John Keeping <john@keeping.me.uk> wrote:
> > This adds a new configuration variable "patchid.cacheRef" which controls
> > whether (and where) patch IDs will be cached in a notes tree.
> 
> Patch ID's aren't stable wrt different diff options, so I think this
> is potentially a very bad idea.

Hmm... I hadn't realised that.  Looking a bit closer, it looks like
init_patch_ids sets up its own diffopts so its not affected by the
command line (except for pathspecs which would be easy to check for).
Of course that still means it can be affected by settings in the user's
configuration.

It's a pity that this can't be done since it gives a significant
performance improvement for some tasks that I perform relatively
frequently.  Is there a reason patch IDs couldn't be generated using
fixed diff options?  Since there's no way to control it from the command
line it seems surprising that the results of "log --cherry" might be
different based on what's in your config.

That could go either way I suppose - is it useful to be able to change
patch IDs based on command line arguments or is it wrong that as we add
persistent diff settings to the configuration we've been silently
changing the behaviour of "git patch-id" and "git cherry".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-11 21:49   ` John Keeping
@ 2013-05-11 22:41     ` Linus Torvalds
  2013-05-11 23:57       ` Johannes Schindelin
  2013-05-12  3:00       ` [PATCH] patch-ids.c: " Junio C Hamano
  0 siblings, 2 replies; 18+ messages in thread
From: Linus Torvalds @ 2013-05-11 22:41 UTC (permalink / raw)
  To: John Keeping; +Cc: Git Mailing List, Johannes Schindelin, Junio C Hamano

On Sat, May 11, 2013 at 2:49 PM, John Keeping <john@keeping.me.uk> wrote:
>
> Hmm... I hadn't realised that.  Looking a bit closer, it looks like
> init_patch_ids sets up its own diffopts so its not affected by the
> command line (except for pathspecs which would be easy to check for).
> Of course that still means it can be affected by settings in the user's
> configuration.

.. and in the actual diff algorithm.

The thing is - patches ARE NOT STABLE. There are many valid ways to
get a patch from one version to another, and even without command line
changes, we've had different versions of git generating different
patches. There are heuristics in xdiff to avoid some nasty "use up
tons of CPU-time" things that have been tweaked over time. And even
for simple cases there are ambiguous ways to describe the patch.

Now, maybe we don't care, because in practice, most of the time this
just doesn't much matter. And anybody who uses patch-ID's had better
not rely on them being guaranteed to be unique anyway, so it's more of
a "if the patch ID is the same, it's almost guaranteed to be the same
patch", but that's a big "almost". And no, it's not some theoretical
"SHA1 collisions are very unlikely" kind of thing, it's a "the patch
ID actually ignores a lot of data in order to give the same ID even if
lins have been added above it, and the patch is at different line
numbers etc".

So maybe it doesn't matter. But at the same time, I really think
caching patch ID's should be something people should be aware of is
fundamentally wrong, even if it might work.

And quite frankly, if you do rebases etc so much that you think patch
ID's are so important that they need to be cached, you may be doing
odd/wrong things.

             Linus

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-11 22:41     ` Linus Torvalds
@ 2013-05-11 23:57       ` Johannes Schindelin
  2013-05-12  9:08         ` John Keeping
  2013-05-12  3:00       ` [PATCH] patch-ids.c: " Junio C Hamano
  1 sibling, 1 reply; 18+ messages in thread
From: Johannes Schindelin @ 2013-05-11 23:57 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: John Keeping, Git Mailing List, Junio C Hamano

Hi John & Linus,

On Sat, 11 May 2013, Linus Torvalds wrote:

> [...] I really think caching patch ID's should be something people
> should be aware of is fundamentally wrong, even if it might work.

Given the incredible performance win, I would say it is still worth
looking into.

If you store also a hash of Git version and diff options (may even be the
hash of the raw bytes of the diff options if you do not plan to share the
ref between machines) with the patch ID, you can make it safe.

That hash would be generated at patch_id init time and
load_cached_patch_id() would check this hash in addition to the return
value of get_sha1() (and ignore the note if the version/diff options
differ).

If you are following git.git slavishly, maybe hashing just the major/minor
Git version would be in order to avoid frequent regeneration of identical
patch IDs.

> And quite frankly, if you do rebases etc so much that you think patch
> ID's are so important that they need to be cached, you may be doing
> odd/wrong things.

AFAICT John actually gave a very valid scenario that validates his use
case: git-gui patches are best tested in the git.git scenario but have to
be contributed via git-gui.git. It's not John's fault that this typically
requires a lot of rebasing between vastly divergent histories.

John, do you think you can incorporate that "finger-print" of the patch ID
generation and store it in the same note?

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-11 22:41     ` Linus Torvalds
  2013-05-11 23:57       ` Johannes Schindelin
@ 2013-05-12  3:00       ` Junio C Hamano
  2013-05-12  8:59         ` John Keeping
  1 sibling, 1 reply; 18+ messages in thread
From: Junio C Hamano @ 2013-05-12  3:00 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: John Keeping, Git Mailing List, Johannes Schindelin

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Sat, May 11, 2013 at 2:49 PM, John Keeping <john@keeping.me.uk> wrote:
>>
>> Hmm... I hadn't realised that.  Looking a bit closer, it looks like
>> init_patch_ids sets up its own diffopts so its not affected by the
>> command line (except for pathspecs which would be easy to check for).
>> Of course that still means it can be affected by settings in the user's
>> configuration.
>
> .. and in the actual diff algorithm.

As to the "objection" side of the argument, I already said
essentially the same thing several months ago:

  http://thread.gmane.org/gmane.comp.version-control.git/202654/focus=202898

and do not have much to add [*1*].

However.

The use of patch-id in cherry and rebase is to facilitate avoiding
to replay commits that are obviously identical to the ones you have
in your history.  The cached patch id for an existing old commit may
differ from a patch id you freshly compute for a new commit you are
trying to see if it truly new, even though they may represent the
same change.  So we may incorrectly think such a new commit is not
yet in your history and attempt to replay it.

But it is not a big problem.  Either 3-way merge notices that there
is nothing new, or you get a conflict and have chance to inspect
what is going on.

A conceptually much larger and more problematic issue is that we may
discard a truly new change that you still need as an old one you
already have due to a hash collision and discard it.  Because the
hash space of SHA-1 is so large, however, it is not a problem in
practice, and more importantly, that hash space is just as large as
the hash space used by Git to reduce a patch to a patch id, the
filtering done with patch-id in cherry and rebase _already_ have
that exact problem with or without this additional cache layer. A
stale cache may make the possibility of lost change due to such a
hash collision merely twice as likely.

> ... it's a "the patch ID actually ignores a lot of data in order
> to give the same ID even if lins have been added above it, and the
> patch is at different line numbers etc".

Yes.

> So maybe it doesn't matter. But at the same time, I really think
> caching patch ID's should be something people should be aware of is
> fundamentally wrong, even if it might work.

I do not think it is "caching patch ID" that people should be aware
of is fundamentally wrong.  What is fundamentally wrong, even if it
might work, is "using patch ID" itself.

> And quite frankly, if you do rebases etc so much that you think patch
> ID's are so important that they need to be cached, you may be doing
> odd/wrong things.

And that, too ;-)


[Footnote]

*1* For people listening from the sidelines, the fact that Git
algorithm can improve over time is a real issue, and and has caused
one issue that still hasn't been solved in the k.org upload process.
Somebody who has a repository there could *theoretically*:

 - push her v1.1 release via Git ("git push origin v1.1");
 - create a tarball ("git archive -o v1.1.tar v1.1") and diff since the
   last release ("git diff v1.0 v1.1 >v1.0-v1.1.diff") locally;
 - GPG sign them ("gpg -b v1.1.tar", "gpg -b v1.0-v1.1.diff"); and
 - upload only the signature files

and have k.org create the tarballs and diff to save bandwidth of
uploading logically derivable stuff over and over again.  But that
can be done only when output from "git archive" and "git diff" are
stable, which is not the case.  We changed how extended header fields
are used in the tar archive for a long pathname recently, and also
we changed use of XDF_NEED_MINIMAL a couple of years ago in "git diff";
both of these affect the output.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-12  3:00       ` [PATCH] patch-ids.c: " Junio C Hamano
@ 2013-05-12  8:59         ` John Keeping
  2013-05-12 22:19           ` Junio C Hamano
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-12  8:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

On Sat, May 11, 2013 at 08:00:44PM -0700, Junio C Hamano wrote:
> Linus Torvalds <torvalds@linux-foundation.org> writes:
> 
> > On Sat, May 11, 2013 at 2:49 PM, John Keeping <john@keeping.me.uk> wrote:
> >>
> >> Hmm... I hadn't realised that.  Looking a bit closer, it looks like
> >> init_patch_ids sets up its own diffopts so its not affected by the
> >> command line (except for pathspecs which would be easy to check for).
> >> Of course that still means it can be affected by settings in the user's
> >> configuration.
> >
> > .. and in the actual diff algorithm.
> 
> As to the "objection" side of the argument, I already said
> essentially the same thing several months ago:
> 
>   http://thread.gmane.org/gmane.comp.version-control.git/202654/focus=202898
> 
> and do not have much to add [*1*].
> 
> However.
> 
> The use of patch-id in cherry and rebase is to facilitate avoiding
> to replay commits that are obviously identical to the ones you have
> in your history.  The cached patch id for an existing old commit may
> differ from a patch id you freshly compute for a new commit you are
> trying to see if it truly new, even though they may represent the
> same change.  So we may incorrectly think such a new commit is not
> yet in your history and attempt to replay it.
> 
> But it is not a big problem.  Either 3-way merge notices that there
> is nothing new, or you get a conflict and have chance to inspect
> what is going on.

It's not a problem here, but false negatives would be annoying if you're
looking at "git log --cherry-mark".

> A conceptually much larger and more problematic issue is that we may
> discard a truly new change that you still need as an old one you
> already have due to a hash collision and discard it.  Because the
> hash space of SHA-1 is so large, however, it is not a problem in
> practice, and more importantly, that hash space is just as large as
> the hash space used by Git to reduce a patch to a patch id, the
> filtering done with patch-id in cherry and rebase _already_ have
> that exact problem with or without this additional cache layer. A
> stale cache may make the possibility of lost change due to such a
> hash collision merely twice as likely.
> 
> > ... it's a "the patch ID actually ignores a lot of data in order
> > to give the same ID even if lins have been added above it, and the
> > patch is at different line numbers etc".
> 
> Yes.
> 
> > So maybe it doesn't matter. But at the same time, I really think
> > caching patch ID's should be something people should be aware of is
> > fundamentally wrong, even if it might work.
> 
> I do not think it is "caching patch ID" that people should be aware
> of is fundamentally wrong.  What is fundamentally wrong, even if it
> might work, is "using patch ID" itself.
> 
> > And quite frankly, if you do rebases etc so much that you think patch
> > ID's are so important that they need to be cached, you may be doing
> > odd/wrong things.
> 
> And that, too ;-)

I've never noticed a problem with rebases, it's when I use "git log
--cherry master..." to see if patches I've sent to a mailing list have
been picked up.

To take Git as an example (albeit a bad one because "What's Cooking" is
a more useful way to track patch state here), if I compare this patch to
pu I have:

	$ git rev-list --left-right --count pu...
	234	1

and caching patch IDs takes that from ~0.6s to ~0.1s.  When doing that
over several branches consecutively that makes a big difference to the
overall runtime, especially because most of the commits of interest will
be cached during the first one.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-11 23:57       ` Johannes Schindelin
@ 2013-05-12  9:08         ` John Keeping
  2013-05-12 11:41           ` [RFC/PATCH v2] patch-ids: " John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-12  9:08 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Linus Torvalds, Git Mailing List, Junio C Hamano

On Sat, May 11, 2013 at 06:57:02PM -0500, Johannes Schindelin wrote:
> On Sat, 11 May 2013, Linus Torvalds wrote:
> 
> > [...] I really think caching patch ID's should be something people
> > should be aware of is fundamentally wrong, even if it might work.
> 
> Given the incredible performance win, I would say it is still worth
> looking into.
> 
> If you store also a hash of Git version and diff options (may even be the
> hash of the raw bytes of the diff options if you do not plan to share the
> ref between machines) with the patch ID, you can make it safe.
> 
> That hash would be generated at patch_id init time and
> load_cached_patch_id() would check this hash in addition to the return
> value of get_sha1() (and ignore the note if the version/diff options
> differ).

I was thinking about this overnight, glad to see someone else had the
same idea :-)

It's slightly annoying because the diff options can be customized after
we return from init_patch_ids() so we either need a new
setup_patch_ids() function to be run after init once diff options have
been set or to set it lazily.  I'll try introducing a setup function.

> If you are following git.git slavishly, maybe hashing just the major/minor
> Git version would be in order to avoid frequent regeneration of identical
> patch IDs.

I think just storing the version is quite good here, and it avoids pain
when a topic that affects patch IDs is working its way through pu and
next.

> > And quite frankly, if you do rebases etc so much that you think patch
> > ID's are so important that they need to be cached, you may be doing
> > odd/wrong things.
> 
> AFAICT John actually gave a very valid scenario that validates his use
> case: git-gui patches are best tested in the git.git scenario but have to
> be contributed via git-gui.git. It's not John's fault that this typically
> requires a lot of rebasing between vastly divergent histories.

Actually, I don't think that use case is valid.  Because it's a subtree
merge I can be absolutely certain that nothing on the LHS of
master...git-gui/master is patch identical to anything on the RHS since
all the paths are different.  So doing "git log --cherry-mark" in that
case is completely useless.  I think my script should be able to learn
that, which gets rid of the really horrible case I was seeing, but it
would be nice to improve the "fast enough" cases as well if it can be
done without too much effort.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC/PATCH v2] patch-ids: cache patch IDs in a notes tree
  2013-05-12  9:08         ` John Keeping
@ 2013-05-12 11:41           ` John Keeping
  2013-05-12 11:57             ` John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-12 11:41 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Linus Torvalds, Git Mailing List, Junio C Hamano

This adds a new configuration variable "patchid.cacheRef" which controls
whether (and where) patch IDs will be cached in a notes tree.  The cache
includes a hash of the diff options in place when the cache was
generated as well as the version of Git that generated it.

When the diff options hash changes we simply ignore that cache entry and
replace it with the new patch ID with the new diff options.  But if the
Git version changes we throw away the entire cache tree on the
assumption that the user is unlikely to be simply flipping between
different versions of Git.

Caching patch IDs in this way results in a performance improvement in
every case I have tried, for example if I run a script which checks
whether any of six local branches have been picked up upstream I get the
following results:

Without patchid.cacheRef enabled:

	$ time git integration --status jk/pu >/dev/null

	real    0m4.295s
	user    0m3.927s
	sys     0m0.270s

With patchid.cacheRef set but not yet initialised:

	$ time git integration --status jk/pu >/dev/null

	real    0m2.317s
	user    0m2.036s
	sys     0m0.187s

With patchid.cacheRef and cache primed:

	$ time git integration --status jk/pu >/dev/null

	real    0m1.565s
	user    0m1.310s
	sys     0m0.153s

The script basically does:

	git log --cherry pu...<branch>

for each of six branches in turn (with some additional commands around
that which are now the bottleneck).

Signed-off-by: John Keeping <john@keeping.me.uk>
---
On Sun, May 12, 2013 at 10:08:51AM +0100, John Keeping wrote:
> On Sat, May 11, 2013 at 06:57:02PM -0500, Johannes Schindelin wrote:
> > On Sat, 11 May 2013, Linus Torvalds wrote:
> > 
> > > [...] I really think caching patch ID's should be something people
> > > should be aware of is fundamentally wrong, even if it might work.
> > 
> > Given the incredible performance win, I would say it is still worth
> > looking into.
> > 
> > If you store also a hash of Git version and diff options (may even be the
> > hash of the raw bytes of the diff options if you do not plan to share the
> > ref between machines) with the patch ID, you can make it safe.
> > 
> > That hash would be generated at patch_id init time and
> > load_cached_patch_id() would check this hash in addition to the return
> > value of get_sha1() (and ignore the note if the version/diff options
> > differ).
> 
> I was thinking about this overnight, glad to see someone else had the
> same idea :-)
> 
> It's slightly annoying because the diff options can be customized after
> we return from init_patch_ids() so we either need a new
> setup_patch_ids() function to be run after init once diff options have
> been set or to set it lazily.  I'll try introducing a setup function.

This is what that looks like.  I think the way I'm hashing the diff
options is sensible but another pair of eyes would be useful there.

A cache entry looks like this:

	9e99e9dbf5c6a717ac60f7ee425c53e87ffd821a
	diffopts:f8ca35c3d9d57076338dff8abf91b07157bb6329
	1.8.3.rc1.45.gcb72da6.dirty

 Documentation/config.txt             |   7 ++
 builtin/log.c                        |   1 +
 patch-ids.c                          | 215 ++++++++++++++++++++++++++++++++++-
 patch-ids.h                          |   4 +
 revision.c                           |   1 +
 t/t6007-rev-list-cherry-pick-file.sh |  16 +++
 6 files changed, 243 insertions(+), 1 deletion(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 6e53fc5..e36585c 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1798,6 +1798,13 @@ pager.<cmd>::
 	precedence over this option.  To disable pagination for all
 	commands, set `core.pager` or `GIT_PAGER` to `cat`.
 
+patchid.cacheRef::
+	The name of a notes ref in which to store patch IDs for commits.
+	The ref is taken to be in `refs/notes/` if it is not qualified.
++
+Note that the patch ID notes are never pruned automatically, so you may
+wish to periodically run `git notes --ref <ref> prune` against this ref.
+
 pretty.<name>::
 	Alias for a --pretty= format string, as specified in
 	linkgit:git-log[1]. Any aliases defined here can be used just
diff --git a/builtin/log.c b/builtin/log.c
index 6e56a50..dd6c24d 100644
--- a/builtin/log.c
+++ b/builtin/log.c
@@ -758,6 +758,7 @@ static void get_patch_ids(struct rev_info *rev, struct patch_ids *ids)
 		die(_("Not a range."));
 
 	init_patch_ids(ids);
+	setup_patch_ids(ids);
 
 	/* given a range a..b get all patch ids for b..a */
 	init_revisions(&check_rev, rev->prefix);
diff --git a/patch-ids.c b/patch-ids.c
index bc8a28f..73b2aaf 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -1,8 +1,12 @@
 #include "cache.h"
+#include "blob.h"
 #include "diff.h"
 #include "commit.h"
+#include "notes.h"
+#include "refs.h"
 #include "sha1-lookup.h"
 #include "patch-ids.h"
+#include "version.h"
 
 static int commit_patch_id(struct commit *commit, struct diff_options *options,
 		    unsigned char *sha1)
@@ -34,6 +38,16 @@ struct patch_id_bucket {
 	struct patch_id bucket[BUCKET_SIZE];
 };
 
+static int patch_id_config(const char *var, const char *value, void *cb)
+{
+	const char **cacheref = cb;
+
+	if (!strcmp(var, "patchid.cacheref"))
+		return git_config_string(cacheref, var, value);
+
+	return 0;
+}
+
 int init_patch_ids(struct patch_ids *ids)
 {
 	memset(ids, 0, sizeof(*ids));
@@ -43,6 +57,108 @@ int init_patch_ids(struct patch_ids *ids)
 	return 0;
 }
 
+static void sha1_update_str(git_SHA_CTX *ctx, const char *s)
+{
+	size_t len = s ? strlen(s) + 1 : 0;
+	long nl = htonl((long) len);
+	git_SHA1_Update(ctx, &nl, sizeof(nl));
+	git_SHA1_Update(ctx, s, len);
+}
+
+static void sha1_update_int(git_SHA_CTX *ctx, int v)
+{
+	long nv = htonl((long) v);
+	git_SHA1_Update(ctx, &nv, sizeof(nv));
+}
+
+static void sha1_update_pathspec(git_SHA_CTX *ctx, struct pathspec *pathspec)
+{
+	int i;
+	/*
+	 * Pathspecs are uniquely identified by their number and match string
+	 * providing that we take limit_pathspec_to_literal into account.
+	 */
+	sha1_update_int(ctx, limit_pathspec_to_literal());
+	sha1_update_int(ctx, pathspec->nr);
+	for (i = 0; i < pathspec->nr; i++)
+		sha1_update_str(ctx, pathspec->items[i].match);
+}
+
+static void hash_diff_options(struct diff_options *options, unsigned char *sha1)
+{
+	git_SHA_CTX ctx;
+	git_SHA1_Init(&ctx);
+
+	sha1_update_str(&ctx, options->filter);
+	/* ignore options->orderfile (see setup_patch_ids) */
+	sha1_update_str(&ctx, options->pickaxe);
+	sha1_update_str(&ctx, options->single_follow);
+	/* a_prefix and b_prefix aren't used for patch IDs */
+
+	sha1_update_int(&ctx, options->flags);
+	/* use_color isn't used for patch IDs */
+	sha1_update_int(&ctx, options->context);
+	sha1_update_int(&ctx, options->interhunkcontext);
+	sha1_update_int(&ctx, options->break_opt);
+	sha1_update_int(&ctx, options->detect_rename);
+	sha1_update_int(&ctx, options->irreversible_delete);
+	sha1_update_int(&ctx, options->skip_stat_unmatch);
+	/* line_termination isn't used for patch IDs */
+	/* output_format isn't used for patch IDs */
+	sha1_update_int(&ctx, options->pickaxe_opts);
+	sha1_update_int(&ctx, options->rename_score);
+	sha1_update_int(&ctx, options->rename_limit);
+	/* needed_rename_limit is set while diffing */
+	/* degraded_cc_to_c is set while diffing */
+	/* show_rename_progress isn't used for patch IDs */
+	/* dirstat_permille isn't used for patch IDs */
+	/* setup isn't used for patch IDs */
+	/* abbrev isn't used for patch IDs */
+	/* prefix and prefix_length aren't used for patch IDs */
+	/* stat_sep isn't used for patch IDs */
+	sha1_update_int(&ctx, options->xdl_opts);
+
+	/* stat arguments aren't used for patch IDs */
+	sha1_update_str(&ctx, options->word_regex);
+	sha1_update_int(&ctx, options->word_diff);
+
+	sha1_update_pathspec(&ctx, &options->pathspec);
+
+	git_SHA1_Final(sha1, &ctx);
+}
+
+int setup_patch_ids(struct patch_ids *ids)
+{
+	char *cacheref = NULL;
+
+	/*
+	 * Make extra sure we aren't using an orderfile as it is unnecessary
+	 * and will break caching.
+	 */
+	ids->diffopts.orderfile = NULL;
+
+	git_config(patch_id_config, &cacheref);
+	if (cacheref) {
+		unsigned char diffopts_raw_hash[20];
+		struct strbuf sb = STRBUF_INIT;
+		strbuf_addstr(&sb, cacheref);
+		expand_notes_ref(&sb);
+
+		ids->cache = xcalloc(1, sizeof(*ids->cache));
+		init_notes(ids->cache, sb.buf, combine_notes_overwrite, 0);
+
+		hash_diff_options(&ids->diffopts, diffopts_raw_hash);
+		strbuf_reset(&sb);
+		strbuf_addf(&sb, "diffopts:%s\n", sha1_to_hex(diffopts_raw_hash));
+		ids->diffopts_hash_len = sb.len;
+		ids->diffopts_hash = strbuf_detach(&sb, NULL);
+
+		free(cacheref);
+	}
+
+	return 0;
+}
+
 int free_patch_ids(struct patch_ids *ids)
 {
 	struct patch_id_bucket *next, *patches;
@@ -52,9 +168,104 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	if (ids->cache) {
+		unsigned char notes_sha1[20];
+		if (write_notes_tree(ids->cache, notes_sha1) ||
+		    update_ref("patch-id: update cache", ids->cache->ref,
+				notes_sha1, NULL, 0, QUIET_ON_ERR))
+			error(_("failed to write patch ID cache"));
+
+		free_notes(ids->cache);
+		ids->cache = NULL;
+
+		free(ids->diffopts_hash);
+		ids->diffopts_hash = NULL;
+	}
+
 	return 0;
 }
 
+static int load_cached_patch_id(struct commit *commit,
+		struct patch_ids *ids, unsigned char *sha1)
+{
+	const unsigned char *note_sha1;
+	char *orig_note;
+	char *note;
+	enum object_type type;
+	unsigned long notelen;
+	int result = 1;
+
+	if (!ids->cache)
+		return 1;
+
+	note_sha1 = get_note(ids->cache, commit->object.sha1);
+	if (!note_sha1)
+		return 1;
+
+	if (!(orig_note = read_sha1_file(note_sha1, &type, &notelen)) ||
+			!notelen || type != OBJ_BLOB)
+		goto out;
+
+	note = orig_note;
+	if (get_sha1_hex(note, sha1))
+		goto out;
+
+	/* Advance past the patch ID */
+	note += 41;
+	/* Was the cached patch ID generated with the same diffopts? */
+	if (strncmp(note, ids->diffopts_hash, ids->diffopts_hash_len)) {
+		goto out;
+	}
+
+	note += ids->diffopts_hash_len;
+	if (note[strlen(note) - 1] == '\n')
+		note[strlen(note) - 1] = '\0';
+	if (strcmp(note, git_version_string)) {
+		struct notes_tree *new_cache;
+		/*
+		 * If the Git version has changed, throw away the entire
+		 * caching notes tree on the assumption that the user will
+		 * not return to the previous version.  We can bail out of
+		 * this function sooner next time round if we don't find a
+		 * note for the commit at all.
+		 */
+		new_cache = xcalloc(1, sizeof(*new_cache));
+		init_notes(new_cache, ids->cache->ref,
+			ids->cache->combine_notes, NOTES_INIT_EMPTY);
+		free_notes(ids->cache);
+		free(ids->cache);
+		ids->cache = new_cache;
+		goto out;
+	}
+
+	result = 0;
+out:
+	free(orig_note);
+	return result;
+}
+
+static void save_cached_patch_id(struct commit *commit,
+		struct patch_ids *ids, unsigned char *sha1)
+{
+	unsigned char note_sha1[20];
+	struct strbuf sb = STRBUF_INIT;
+	if (!ids->cache)
+		return;
+
+	strbuf_addstr(&sb, sha1_to_hex(sha1));
+	strbuf_addch(&sb, '\n');
+	strbuf_add(&sb, ids->diffopts_hash, ids->diffopts_hash_len);
+	strbuf_addstr(&sb, git_version_string);
+	strbuf_addch(&sb, '\n');
+
+	if (write_sha1_file(sb.buf, sb.len, blob_type, note_sha1) ||
+	    add_note(ids->cache, commit->object.sha1, note_sha1, NULL))
+		error(_("unable to save patch ID in cache"));
+
+	strbuf_release(&sb);
+}
+
 static struct patch_id *add_commit(struct commit *commit,
 				   struct patch_ids *ids,
 				   int no_add)
@@ -64,11 +275,13 @@ static struct patch_id *add_commit(struct commit *commit,
 	unsigned char sha1[20];
 	int pos;
 
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
+	if (load_cached_patch_id(commit, ids, sha1) &&
+	    commit_patch_id(commit, &ids->diffopts, sha1))
 		return NULL;
 	pos = patch_pos(ids->table, ids->nr, sha1);
 	if (0 <= pos)
 		return ids->table[pos];
+	save_cached_patch_id(commit, ids, sha1);
 	if (no_add)
 		return NULL;
 
diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..08d4dd3 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -8,12 +8,16 @@ struct patch_id {
 
 struct patch_ids {
 	struct diff_options diffopts;
+	struct notes_tree *cache;
+	char *diffopts_hash;
+	size_t diffopts_hash_len;
 	int nr, alloc;
 	struct patch_id **table;
 	struct patch_id_bucket *patches;
 };
 
 int init_patch_ids(struct patch_ids *);
+int setup_patch_ids(struct patch_ids *);
 int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
 struct patch_id *has_commit_patch_id(struct commit *, struct patch_ids *);
diff --git a/revision.c b/revision.c
index a67b615..0de4168 100644
--- a/revision.c
+++ b/revision.c
@@ -632,6 +632,7 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 	left_first = left_count < right_count;
 	init_patch_ids(&ids);
 	ids.diffopts.pathspec = revs->diffopt.pathspec;
+	setup_patch_ids(&ids);
 
 	/* Compute patch-ids for one side */
 	for (p = list; p; p = p->next) {
diff --git a/t/t6007-rev-list-cherry-pick-file.sh b/t/t6007-rev-list-cherry-pick-file.sh
index 28d4f6b..378cf3e 100755
--- a/t/t6007-rev-list-cherry-pick-file.sh
+++ b/t/t6007-rev-list-cherry-pick-file.sh
@@ -207,4 +207,20 @@ test_expect_success '--count --left-right' '
 	test_cmp expect actual
 '
 
+cat >expect <<EOF
+3	2	0
+EOF
+
+test_expect_success 'rev-list --cherry-mark caches patch ids' '
+	test_config patchid.cacheref patchids &&
+	git rev-list --cherry-mark --left-right --count F...E >actual &&
+	test_cmp expect actual &&
+	git notes --ref patchids show master >cached_master &&
+	git log -p -1 master | git patch-id | sed -e "s/ .*//" >patch-id_master &&
+	test_cmp patch-id_master cached_master &&
+	# Get the patch IDs again, now they should be cached
+	git rev-list --cherry-mark --left-right --count F...E >actual &&
+	test_cmp expect actual
+'
+
 test_done
-- 
1.8.3.rc1.45.gcb72da6.dirty

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC/PATCH v2] patch-ids: cache patch IDs in a notes tree
  2013-05-12 11:41           ` [RFC/PATCH v2] patch-ids: " John Keeping
@ 2013-05-12 11:57             ` John Keeping
  0 siblings, 0 replies; 18+ messages in thread
From: John Keeping @ 2013-05-12 11:57 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Linus Torvalds, Git Mailing List, Junio C Hamano

On Sun, May 12, 2013 at 12:41:31PM +0100, John Keeping wrote:
> diff --git a/t/t6007-rev-list-cherry-pick-file.sh b/t/t6007-rev-list-cherry-pick-file.sh
> index 28d4f6b..378cf3e 100755
> --- a/t/t6007-rev-list-cherry-pick-file.sh
> +++ b/t/t6007-rev-list-cherry-pick-file.sh
> @@ -207,4 +207,20 @@ test_expect_success '--count --left-right' '
>  	test_cmp expect actual
>  '
>  
> +cat >expect <<EOF
> +3	2	0
> +EOF
> +
> +test_expect_success 'rev-list --cherry-mark caches patch ids' '
> +	test_config patchid.cacheref patchids &&
> +	git rev-list --cherry-mark --left-right --count F...E >actual &&
> +	test_cmp expect actual &&
> +	git notes --ref patchids show master >cached_master &&

I forgot to update this test, it needs a "| sed -e 1q" in the middle of
this line to make sure that we're only checking the patch ID and not the
diffopts hash and Git version.

> +	git log -p -1 master | git patch-id | sed -e "s/ .*//" >patch-id_master &&
> +	test_cmp patch-id_master cached_master &&
> +	# Get the patch IDs again, now they should be cached
> +	git rev-list --cherry-mark --left-right --count F...E >actual &&
> +	test_cmp expect actual
> +'
> +
>  test_done

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-12  8:59         ` John Keeping
@ 2013-05-12 22:19           ` Junio C Hamano
  2013-05-13  7:59             ` John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: Junio C Hamano @ 2013-05-12 22:19 UTC (permalink / raw)
  To: John Keeping; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

John Keeping <john@keeping.me.uk> writes:

>> But it is not a big problem.  Either 3-way merge notices that there
>> is nothing new, or you get a conflict and have chance to inspect
>> what is going on.
>
> It's not a problem here, but false negatives would be annoying if you're
> looking at "git log --cherry-mark".

The primary thing to notice is that it is not a new problem with or
without the caching layer.  As Linus mentioned how patch-ids are
computed by ignoring offsets and whitespaces, the filtering is done
as a crude approximation and false negatives are part of design, so
making the cache more complex by recording hash of the binary and/or
options used to compute misses the fundamental.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-12 22:19           ` Junio C Hamano
@ 2013-05-13  7:59             ` John Keeping
  2013-05-13 13:53               ` Junio C Hamano
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-13  7:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

On Sun, May 12, 2013 at 03:19:49PM -0700, Junio C Hamano wrote:
> John Keeping <john@keeping.me.uk> writes:
> 
> >> But it is not a big problem.  Either 3-way merge notices that there
> >> is nothing new, or you get a conflict and have chance to inspect
> >> what is going on.
> >
> > It's not a problem here, but false negatives would be annoying if you're
> > looking at "git log --cherry-mark".
> 
> The primary thing to notice is that it is not a new problem with or
> without the caching layer.  As Linus mentioned how patch-ids are
> computed by ignoring offsets and whitespaces, the filtering is done
> as a crude approximation and false negatives are part of design, so
> making the cache more complex by recording hash of the binary and/or
> options used to compute misses the fundamental.

The caching layer could also introduce false positives though, which is
more serious.  If you cache patch IDs with a pathspec restriction and
then run a command that uses the cache with no such restriction you
could hit a patch that is the same for those paths but also touches
other paths that you don't want to ignore and mark it as patch identical
even though it is not.

Adding a hash of the diffopts fixes that.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-13  7:59             ` John Keeping
@ 2013-05-13 13:53               ` Junio C Hamano
  2013-05-13 14:02                 ` John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: Junio C Hamano @ 2013-05-13 13:53 UTC (permalink / raw)
  To: John Keeping; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

John Keeping <john@keeping.me.uk> writes:

> The caching layer could also introduce false positives though, which is
> more serious.  If you cache patch IDs with a pathspec restriction ...

What?  What business does patch-id have with pathspec-limited diff
generation?  You do not rebase or cherry-pick with pathspec, so
unless you are populating the patch-id cache at a wrong point (like,
say whenevern "git show $commit" is run), I am not sure why pathspec
limit becomes even an issue.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-13 13:53               ` Junio C Hamano
@ 2013-05-13 14:02                 ` John Keeping
  2013-05-13 14:46                   ` Junio C Hamano
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-13 14:02 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

On Mon, May 13, 2013 at 06:53:29AM -0700, Junio C Hamano wrote:
> John Keeping <john@keeping.me.uk> writes:
> 
> > The caching layer could also introduce false positives though, which is
> > more serious.  If you cache patch IDs with a pathspec restriction ...
> 
> What?  What business does patch-id have with pathspec-limited diff
> generation?  You do not rebase or cherry-pick with pathspec, so
> unless you are populating the patch-id cache at a wrong point (like,
> say whenevern "git show $commit" is run), I am not sure why pathspec
> limit becomes even an issue.

revision.c::cherry_pick_list() sets the pathspec to what was specified
in the revision options.  It's done that since commit 36d56de (Fix
--cherry-pick with given paths, 2007-07-10) and t6007 tests that it
works.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-13 14:02                 ` John Keeping
@ 2013-05-13 14:46                   ` Junio C Hamano
  2013-05-13 14:59                     ` John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: Junio C Hamano @ 2013-05-13 14:46 UTC (permalink / raw)
  To: John Keeping; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

John Keeping <john@keeping.me.uk> writes:

> On Mon, May 13, 2013 at 06:53:29AM -0700, Junio C Hamano wrote:
>> John Keeping <john@keeping.me.uk> writes:
>> 
>> > The caching layer could also introduce false positives though, which is
>> > more serious.  If you cache patch IDs with a pathspec restriction ...
>> 
>> What?  What business does patch-id have with pathspec-limited diff
>> generation?  You do not rebase or cherry-pick with pathspec, so
>> unless you are populating the patch-id cache at a wrong point (like,
>> say whenevern "git show $commit" is run), I am not sure why pathspec
>> limit becomes even an issue.
>
> revision.c::cherry_pick_list() sets the pathspec to what was specified
> in the revision options.  It's done that since commit 36d56de (Fix
> --cherry-pick with given paths, 2007-07-10) and t6007 tests that it
> works.

Then the caching should be automatically turned off when pathspec is
given.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-13 14:46                   ` Junio C Hamano
@ 2013-05-13 14:59                     ` John Keeping
  2013-05-13 15:45                       ` Junio C Hamano
  0 siblings, 1 reply; 18+ messages in thread
From: John Keeping @ 2013-05-13 14:59 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

On Mon, May 13, 2013 at 07:46:09AM -0700, Junio C Hamano wrote:
> John Keeping <john@keeping.me.uk> writes:
> 
> > On Mon, May 13, 2013 at 06:53:29AM -0700, Junio C Hamano wrote:
> >> John Keeping <john@keeping.me.uk> writes:
> >> 
> >> > The caching layer could also introduce false positives though, which is
> >> > more serious.  If you cache patch IDs with a pathspec restriction ...
> >> 
> >> What?  What business does patch-id have with pathspec-limited diff
> >> generation?  You do not rebase or cherry-pick with pathspec, so
> >> unless you are populating the patch-id cache at a wrong point (like,
> >> say whenevern "git show $commit" is run), I am not sure why pathspec
> >> limit becomes even an issue.
> >
> > revision.c::cherry_pick_list() sets the pathspec to what was specified
> > in the revision options.  It's done that since commit 36d56de (Fix
> > --cherry-pick with given paths, 2007-07-10) and t6007 tests that it
> > works.
> 
> Then the caching should be automatically turned off when pathspec is
> given.

That was my first thought, but since we can be affected by other diff
options set in the user's config as well it ended up being simpler to
include it in the options hash and use that.

This has the advantage that you get the benefit of the cache if you run
"git log --cherry-mark" with the same paths more than once.  In my
testing the cache is beneficial as soon as you examine more than one
similar range (e.g. master...feature-A and then master...feature-B).

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-13 14:59                     ` John Keeping
@ 2013-05-13 15:45                       ` Junio C Hamano
  2013-05-13 15:52                         ` John Keeping
  0 siblings, 1 reply; 18+ messages in thread
From: Junio C Hamano @ 2013-05-13 15:45 UTC (permalink / raw)
  To: John Keeping; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

John Keeping <john@keeping.me.uk> writes:

> This has the advantage that you get the benefit of the cache if you run
> "git log --cherry-mark" with the same paths more than once.  In my
> testing the cache is beneficial as soon as you examine more than one
> similar range (e.g. master...feature-A and then master...feature-B).

OK, so perhaps the notes that are keyed with commit ID will record
multiple entries, one for each invocation pattern (i.e. all pathspec
given, possibly with nonstandard options)?

"git diff -- t Documentation" and "git diff -- Docu\* t", even
though they use different pathspec, would produce the same diff;
instead of pathspec you may need to key with the actual list of
paths in the patch, though.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] patch-ids.c: cache patch IDs in a notes tree
  2013-05-13 15:45                       ` Junio C Hamano
@ 2013-05-13 15:52                         ` John Keeping
  0 siblings, 0 replies; 18+ messages in thread
From: John Keeping @ 2013-05-13 15:52 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Linus Torvalds, Git Mailing List, Johannes Schindelin

On Mon, May 13, 2013 at 08:45:21AM -0700, Junio C Hamano wrote:
> John Keeping <john@keeping.me.uk> writes:
> 
> > This has the advantage that you get the benefit of the cache if you run
> > "git log --cherry-mark" with the same paths more than once.  In my
> > testing the cache is beneficial as soon as you examine more than one
> > similar range (e.g. master...feature-A and then master...feature-B).
> 
> OK, so perhaps the notes that are keyed with commit ID will record
> multiple entries, one for each invocation pattern (i.e. all pathspec
> given, possibly with nonstandard options)?

That would be possible, but I didn't do it in the current version of the
patch.

> "git diff -- t Documentation" and "git diff -- Docu\* t", even
> though they use different pathspec, would produce the same diff;
> instead of pathspec you may need to key with the actual list of
> paths in the patch, though.

Maybe, but I think that would be overkill.

I'm interested to see how much of a benefit we could get by not
calculating the patch ID of any commits on the larger side of a
symmetric range that touch paths outside the set touched by the smaller
side.  (revision.c::cherry_pick_list() remembers patch IDs for the
smaller side of a symmetric range and then checks if anything on the
larger side matches so this fits in naturally.)

In my usage I generally compare a relatively small topic branch against
whatever has happened in some upstream branch so I think this could give
a big improvement but I haven't had time to try it yet.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-05-13 15:52 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-11 19:54 [PATCH] patch-ids.c: cache patch IDs in a notes tree John Keeping
2013-05-11 21:10 ` Linus Torvalds
2013-05-11 21:49   ` John Keeping
2013-05-11 22:41     ` Linus Torvalds
2013-05-11 23:57       ` Johannes Schindelin
2013-05-12  9:08         ` John Keeping
2013-05-12 11:41           ` [RFC/PATCH v2] patch-ids: " John Keeping
2013-05-12 11:57             ` John Keeping
2013-05-12  3:00       ` [PATCH] patch-ids.c: " Junio C Hamano
2013-05-12  8:59         ` John Keeping
2013-05-12 22:19           ` Junio C Hamano
2013-05-13  7:59             ` John Keeping
2013-05-13 13:53               ` Junio C Hamano
2013-05-13 14:02                 ` John Keeping
2013-05-13 14:46                   ` Junio C Hamano
2013-05-13 14:59                     ` John Keeping
2013-05-13 15:45                       ` Junio C Hamano
2013-05-13 15:52                         ` John Keeping

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.