All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff King <peff@peff.net>
To: John Keeping <john@keeping.me.uk>
Cc: git@vger.kernel.org, Kevin Bracey <kevin@bracey.fi>,
	Junio C Hamano <gitster@pobox.com>
Subject: Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff
Date: Wed, 29 May 2013 03:50:20 -0400	[thread overview]
Message-ID: <20130529075020.GC11955@sigill.intra.peff.net> (raw)
In-Reply-To: <20130529072225.GB11955@sigill.intra.peff.net>

On Wed, May 29, 2013 at 03:22:25AM -0400, Jeff King wrote:

>   revs=origin/master...origin/jk/submodule-subdirectory-ok
>         stock    |    you   |    me
>         -------------------------------
>   real  0m0.501s | 0m0.078s | 0m0.098s
>   user  0m0.480s | 0m0.056s | 0m0.084s
>   sys   0m0.016s | 0m0.020s | 0m0.012s
> 
>   revs=origin/next...origin/pu
>         stock    |    you   |    me
>         -------------------------------
>   real  0m0.857s | 0m0.847s | 0m0.519s
>   user  0m0.828s | 0m0.812s | 0m0.488s
>   sys   0m0.024s | 0m0.028s | 0m0.024s
> 
> So it performs slightly less well on the small case, and a bit better on
> the large case. I think part of the problem is that when we do have a
> "loose" hit, we end up re-doing the tree diff to find the strict, which
> involves re-inflating the trees. It's unavoidable for the lazy entries
> unless we want to cache the whole diff_queued_diff, but for the non-lazy
> entries we should be able to do the strict patch-id incrementally. We
> just need to improve the function interfaces.

The (somewhat hacky) patch below, on top of my previous one, does that
optimization.  But the timings aren't improved by much. My best-of-five
for the second case went down to:

  real    0m0.495s
  user    0m0.488s
  sys     0m0.004s

However, the actual time to just do "git log --raw $revs" is:

  real    0m0.333s
  user    0m0.292s
  sys     0m0.032s

which provides a lower-bound for how well we can do, as it is just doing
a single tree diff for each commit. So I think we have reaped most of
the benefits of this approach already (and we will generally have to do
_some_ true patch-id calculations, so we can never meet that lower
bound).

---
 diff.c      | 11 +++-------
 diff.h      |  3 ++-
 patch-ids.c | 39 ++++++++++++++++++++++++++---------
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/diff.c b/diff.c
index 3b55788..161c5bf 100644
--- a/diff.c
+++ b/diff.c
@@ -4233,8 +4233,8 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1,
-			     int loose)
+int diff_get_patch_id(struct diff_options *options, unsigned char *sha1,
+		      int loose)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4330,21 +4330,16 @@ int diff_flush_patch_id(struct diff_options *options,
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options,
-			unsigned char *sha1,
-			int loose)
+void diff_queue_clear(void)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1, loose);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
 
 	free(q->queue);
 	DIFF_QUEUE_CLEAR(q);
-
-	return result;
 }
 
 static int is_summary_empty(const struct diff_queue_struct *q)
diff --git a/diff.h b/diff.h
index b018aaf..7207d4b 100644
--- a/diff.h
+++ b/diff.h
@@ -320,7 +320,8 @@ extern int do_diff_cache(const unsigned char *, struct diff_options *);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int loose);
+extern int diff_get_patch_id(struct diff_options *, unsigned char *, int loose);
+extern void diff_queue_clear(void);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index 3a83ee6..83cda92 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -4,8 +4,7 @@
 #include "sha1-lookup.h"
 #include "patch-ids.h"
 
-static int commit_patch_id(struct commit *commit, struct diff_options *options,
-			   unsigned char *sha1, int loose)
+static void start_patch_id(struct commit *commit, struct diff_options *options)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
@@ -13,7 +12,6 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1, loose);
 }
 
 int init_patch_ids(struct patch_ids *ids)
@@ -50,12 +48,16 @@ static struct patch_id *add_commit(struct commit *commit,
 				   int no_add)
 {
 	struct patch_id *p;
-	unsigned char loose[20], strict[20];
+	unsigned char loose[20], strict[20] = {0};
 	unsigned long hash;
 	void **pos;
 
-	if (commit_patch_id(commit, &ids->diffopts, loose, 1))
+	start_patch_id(commit, &ids->diffopts);
+	if (diff_get_patch_id(&ids->diffopts, loose, 1)) {
+		diff_queue_clear();
 		return NULL;
+	}
+
 	hash = hash_sha1(loose);
 
 	p = lookup_hash(hash, &ids->table);
@@ -66,15 +68,24 @@ static struct patch_id *add_commit(struct commit *commit,
 
 		/*
 		 * We have a real loose match; lazily load and cache the strict
-		 * id to compare.
+		 * id to compare. We must do the "current" commit first, as its
+		 * incremental state is waiting in the diff machinery.
 		 */
+		if (is_null_sha1(strict)) {
+			int result = diff_get_patch_id(&ids->diffopts, strict, 0);
+			diff_queue_clear();
+			if (result)
+				return NULL;
+		}
+
 		if (is_null_sha1(p->strict)) {
-			if (commit_patch_id(p->commit, &ids->diffopts,
-					    p->strict, 0))
+			int result;
+			start_patch_id(p->commit, &ids->diffopts);
+			result = diff_get_patch_id(&ids->diffopts, p->strict, 0);
+			diff_queue_clear();
+			if (result)
 				return NULL;
 		}
-		if (commit_patch_id(commit, &ids->diffopts, strict, 0))
-			return NULL;
 
 		/*
 		 * If the strict ones match, we do not need to look farther;
@@ -85,6 +96,14 @@ static struct patch_id *add_commit(struct commit *commit,
 	}
 
 	/*
+	 * If we get here and have not filled in strict, then we do not need
+	 * to compute it (for now), but we must clean up the leftover diff
+	 * state.
+	 */
+	if (is_null_sha1(strict))
+		diff_queue_clear();
+
+	/*
 	 * Otherwise, we may have a loose but not strict match, or even no
 	 * loose match at all. Now we can add the new entry (or return failure
 	 * to look it up).

  reply	other threads:[~2013-05-29  7:50 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-05-19 13:17 [RFC/PATCH] patch-ids: check modified paths before calculating diff John Keeping
2013-05-19 19:36 ` Jonathan Nieder
2013-05-19 22:02   ` John Keeping
2013-05-20  6:36     ` Jonathan Nieder
2013-05-20  8:16       ` John Keeping
2013-05-20  4:46   ` Junio C Hamano
2013-05-29  6:20 ` Jeff King
2013-05-29  7:22   ` Jeff King
2013-05-29  7:50     ` Jeff King [this message]
2013-05-29 18:08   ` Junio C Hamano
2013-05-29 18:36     ` Jeff King
2013-05-29 18:48       ` Junio C Hamano
2013-05-29 21:30         ` John Keeping

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130529075020.GC11955@sigill.intra.peff.net \
    --to=peff@peff.net \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=john@keeping.me.uk \
    --cc=kevin@bracey.fi \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.