[RFC/PATCH] patch-ids: check modified paths before calculating diff

* [RFC/PATCH] patch-ids: check modified paths before calculating diff
@ 2013-05-19 13:17 John Keeping
  2013-05-19 19:36 ` Jonathan Nieder
  2013-05-29  6:20 ` Jeff King
  0 siblings, 2 replies; 13+ messages in thread
From: John Keeping @ 2013-05-19 13:17 UTC (permalink / raw)
  To: git; +Cc: Kevin Bracey, Junio C Hamano, John Keeping

When using "git cherry" or "git log --cherry-pick" we often have a small
number of commits on one side and a large number on the other.  In
revision.c::cherry_pick_list we store the patch IDs for the small side
before comparing the large side to this.

In this case, it is likely that only a small number of paths are touched
by the commits on the smaller side of the range and by storing these we
can ignore many commits on the other side before unpacking blobs and
diffing them.

This improves performance in every example I have tried (times are best
of three, using git.git):

Before:
$ time git log --cherry master...jk/submodule-subdirectory-ok >/dev/null

real    0m0.373s
user    0m0.341s
sys     0m0.031s

After:
$ time git log --cherry master...jk/submodule-subdirectory-ok >/dev/null

real    0m0.060s
user    0m0.055s
sys     0m0.005s

Before:
$ time git log --cherry next...pu >/dev/null

real    0m0.661s
user    0m0.617s
sys     0m0.044s

After:
$ time git log --cherry next...pu >/dev/null

real    0m0.509s
user    0m0.478s
sys     0m0.030s

Signed-off-by: John Keeping <john@keeping.me.uk>
---
 patch-ids.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 patch-ids.h |   3 ++
 2 files changed, 145 insertions(+)

diff --git a/patch-ids.c b/patch-ids.c
index bc8a28f..912f40c 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -1,5 +1,6 @@
 #include "cache.h"
 #include "diff.h"
+#include "diffcore.h"
 #include "commit.h"
 #include "sha1-lookup.h"
 #include "patch-ids.h"
@@ -16,6 +17,137 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 	return diff_flush_patch_id(options, sha1);
 }
 
+struct collect_paths_info {
+	struct string_list *paths;
+	int searching;
+};
+
+static int collect_paths_recursive(int n, struct tree_desc *t,
+	const char *base, struct pathspec *pathspec,
+	struct collect_paths_info *data);
+
+static int same_entry(struct name_entry *a, struct name_entry *b)
+{
+	if (!a->sha1 || !b->sha1)
+		return a->sha1 == b->sha1;
+	return	!hashcmp(a->sha1, b->sha1) &&
+		a->mode == b->mode;
+}
+
+static char *traverse_path(const struct traverse_info *info,
+		const struct name_entry *n)
+{
+	char *path = xmalloc(traverse_path_len(info, n) + 1);
+	return make_traverse_path(path, info, n);
+}
+
+static int add_touched_path(struct collect_paths_info *info, const char *path)
+{
+	if (info->searching) {
+		if (!string_list_has_string(info->paths, path))
+			return -1;
+	}
+	string_list_insert(info->paths, path);
+	return 0;
+}
+
+static inline const unsigned char *dir_sha1(struct name_entry *e)
+{
+	if (S_ISDIR(e->mode))
+		return e->sha1;
+	return NULL;
+}
+
+static int collect_touched_paths_cb(int n, unsigned long mask,
+		unsigned long dirmask, struct name_entry *entry,
+		struct traverse_info *info)
+{
+	struct collect_paths_info *collect_info = info->data;
+	if (n == 1) {
+		/* We're handling a root commit - add all the paths. */
+		if (entry[0].sha1 && !S_ISDIR(entry[0].mode)) {
+			if (add_touched_path(collect_info, entry[0].path))
+				return -1;
+		} else if (S_ISDIR(entry[0].mode)) {
+			char *newbase = traverse_path(info, entry);
+			struct tree_desc t[1];
+			void *buf0 = fill_tree_descriptor(t, entry[0].sha1);
+			int error = collect_paths_recursive(1, t, newbase,
+					info->pathspec, collect_info);
+			free(buf0);
+			free(newbase);
+			if (error < 0)
+				return error;
+		}
+		return mask;
+	}
+
+	if (same_entry(entry+0, entry+1))
+		return mask;
+
+	if (entry[0].mode && !S_ISDIR(entry[0].mode))
+		if (add_touched_path(collect_info, entry[0].path))
+			return -1;
+	if (entry[1].mode && !S_ISDIR(entry[1].mode))
+		if (add_touched_path(collect_info, entry[1].path))
+			return -1;
+
+	if ((entry[0].sha1 && S_ISDIR(entry[0].mode)) ||
+	    (entry[1].sha1 && S_ISDIR(entry[1].mode))) {
+		char *newbase = traverse_path(info,
+				S_ISDIR(entry[0].mode) ? entry+0 : entry+1);
+		struct tree_desc t[2];
+		void *buf0 = fill_tree_descriptor(t+0, dir_sha1(entry+0));
+		void *buf1 = fill_tree_descriptor(t+1, dir_sha1(entry+1));
+		int error = collect_paths_recursive(2, t, newbase,
+				info->pathspec, collect_info);
+		free(buf0);
+		free(buf1);
+		free(newbase);
+		if (error < 0)
+			return error;
+	}
+
+	return mask;
+}
+
+static int collect_paths_recursive(int n, struct tree_desc *t,
+		const char *base, struct pathspec *pathspec,
+		struct collect_paths_info *data)
+{
+	struct traverse_info info;
+
+	setup_traverse_info(&info, base);
+	info.data = data;
+	info.fn = collect_touched_paths_cb;
+	info.pathspec = pathspec;
+
+	return traverse_trees(n, t, &info);
+}
+
+static int collect_touched_paths(struct commit *commit, struct patch_ids *ids,
+		int searching)
+{
+	struct tree_desc trees[2];
+	struct collect_paths_info info = { &ids->touched_paths, searching };
+	void *commitbuf;
+	int result;
+
+	commitbuf = fill_tree_descriptor(trees + 1, commit->object.sha1);
+	if (commit->parents) {
+		void *parentbuf = fill_tree_descriptor(trees + 0,
+					commit->parents->item->object.sha1);
+		result = collect_paths_recursive(2, trees, "",
+				&ids->diffopts.pathspec, &info);
+		free(parentbuf);
+	} else {
+		result = collect_paths_recursive(1, trees + 1, "",
+				&ids->diffopts.pathspec, &info);
+	}
+	free(commitbuf);
+	return result;
+}
+
 static const unsigned char *patch_id_access(size_t index, void *table)
 {
 	struct patch_id **id_table = table;
@@ -40,6 +172,7 @@ int init_patch_ids(struct patch_ids *ids)
 	diff_setup(&ids->diffopts);
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	diff_setup_done(&ids->diffopts);
+	ids->touched_paths.strdup_strings = 1;
 	return 0;
 }
 
@@ -52,6 +185,8 @@ int free_patch_ids(struct patch_ids *ids)
 		next = patches->next;
 		free(patches);
 	}
+
+	string_list_clear(&ids->touched_paths, 0);
 	return 0;
 }
 
@@ -64,6 +199,13 @@ static struct patch_id *add_commit(struct commit *commit,
 	unsigned char sha1[20];
 	int pos;
 
+	if (no_add) {
+		if (collect_touched_paths(commit, ids, 1) < 0)
+			return NULL;
+	} else {
+		collect_touched_paths(commit, ids, 0);
+	}
+
 	if (commit_patch_id(commit, &ids->diffopts, sha1))
 		return NULL;
 	pos = patch_pos(ids->table, ids->nr, sha1);
diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c90bec5 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -1,6 +1,8 @@
 #ifndef PATCH_IDS_H
 #define PATCH_IDS_H
 
+#include "string-list.h"
+
 struct patch_id {
 	unsigned char patch_id[20];
 	char seen;
@@ -8,6 +10,7 @@ struct patch_id {
 
 struct patch_ids {
 	struct diff_options diffopts;
+	struct string_list touched_paths;
 	int nr, alloc;
 	struct patch_id **table;
 	struct patch_id_bucket *patches;
-- 
1.8.3.rc3.372.g721bad8

^ permalink raw reply related	[flat|nested] 13+ messages in thread