All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 0/4] read-cache: speed up add_index_entry
@ 2017-04-04 21:08 git
  2017-04-04 21:08 ` [PATCH v4 1/4] p0004-read-tree: perf test to time read-tree git
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: git @ 2017-04-04 21:08 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach add_index_entry_with_check() and has_dir_name()
to avoid index lookups if the given path sorts after
the last entry in the index.

This saves at least 2 binary searches per entry.

This improves performance during checkout and read-tree because
merge_working_tree() and unpack_trees() processes a list of already
sorted entries.

This helps performance on very large repositories.

================
Before and after numbers on index with 1M files.
./p0004-read-tree.sh
0004.2: read-tree (1003037)              3.24(2.46+0.72)
0004.3: switch branches (3038 1003037)   7.53(5.66+1.56)

$ ./p0004-read-tree.sh
0004.2: read-tree (1003040)              2.45(1.79+0.61)
0004.3: switch branches (3041 1003040)   6.65(4.22+1.60)

================
Before and after numbers on index with 100K files.

./p0004-read-tree.sh
0004.2: read-tree (103037)              0.30(0.20+0.08)
0004.3: switch branches (3038 103037)   0.65(0.47+0.16)

$ ./p0004-read-tree.sh
0004.2: read-tree (103040)              0.25(0.16+0.07)
0004.3: switch branches (3041 103040)   0.58(0.44+0.13)
================


Jeff Hostetler (4):
  p0004-read-tree: perf test to time read-tree
  read-cache: add strcmp_offset function
  test-strcmp-offset: created test for strcmp_offset
  read-cache: speed up add_index_entry during checkout

 Makefile                      |  1 +
 cache.h                       |  1 +
 read-cache.c                  | 73 ++++++++++++++++++++++++++++++++++++-
 t/helper/.gitignore           |  1 +
 t/helper/test-strcmp-offset.c | 64 +++++++++++++++++++++++++++++++++
 t/perf/p0004-read-tree.sh     | 84 +++++++++++++++++++++++++++++++++++++++++++
 t/t0065-strcmp-offset.sh      | 11 ++++++
 7 files changed, 234 insertions(+), 1 deletion(-)
 create mode 100644 t/helper/test-strcmp-offset.c
 create mode 100755 t/perf/p0004-read-tree.sh
 create mode 100755 t/t0065-strcmp-offset.sh

-- 
2.9.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v4 1/4] p0004-read-tree: perf test to time read-tree
  2017-04-04 21:08 [PATCH v4 0/4] read-cache: speed up add_index_entry git
@ 2017-04-04 21:08 ` git
  2017-04-04 21:08 ` [PATCH v4 2/4] read-cache: add strcmp_offset function git
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: git @ 2017-04-04 21:08 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/perf/p0004-read-tree.sh | 84 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100755 t/perf/p0004-read-tree.sh

diff --git a/t/perf/p0004-read-tree.sh b/t/perf/p0004-read-tree.sh
new file mode 100755
index 0000000..ffbbf8e
--- /dev/null
+++ b/t/perf/p0004-read-tree.sh
@@ -0,0 +1,84 @@
+#!/bin/sh
+
+test_description="Tests performance of read-tree"
+
+. ./perf-lib.sh
+
+test_perf_default_repo
+test_checkout_worktree
+
+## usage: dir depth width files
+make_paths () {
+	for f in $(seq $4)
+	do
+		echo $1/file$f
+	done;
+	if test $2 -gt 0;
+	then
+		for w in $(seq $3)
+		do
+			make_paths $1/dir$w $(($2 - 1)) $3 $4
+		done
+	fi
+	return 0
+}
+
+fill_index () {
+	make_paths $1 $2 $3 $4 |
+	sed "s/^/100644 $EMPTY_BLOB	/" |
+	git update-index --index-info
+	return 0
+}
+
+br_base=xxx_base_xxx
+br_work=xxx_work_xxx
+
+new_dir=xxx_dir_xxx
+
+## (5, 10, 9) will create 999,999 files.
+## (4, 10, 9) will create  99,999 files.
+depth=5
+width=10
+files=9
+
+export br_base
+export br_work
+export new_dir
+export depth
+export width
+export files
+
+## The number of files in the xxx_base_xxx branch.
+nr_base=$(git ls-files | wc -l)
+export nr_base
+
+## Inflate the index with thousands of empty files and commit it.
+## Turn on sparse-checkout so that we don't have to populate them
+## later when we start switching branches.
+test_expect_success 'inflate the index' '
+	git reset --hard &&
+	git branch $br_base &&
+	git branch $br_work &&
+	git checkout $br_work &&
+	fill_index $new_dir $depth $width $files &&
+	git commit -m $br_work &&
+	echo $new_dir/file1 >.git/info/sparse-checkout &&
+	git config --local core.sparsecheckout 1 &&
+	git reset --hard
+'
+
+## The number of files in the xxx_work_xxx branch.
+nr_work=$(git ls-files | wc -l)
+export nr_work
+
+test_perf "read-tree ($nr_work)" '
+	git read-tree -m $br_base $br_work -n
+'
+
+test_perf "switch branches ($nr_base $nr_work)" '
+	git checkout $br_base &&
+	git checkout $br_work
+'
+
+
+test_done
-- 
2.9.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v4 2/4] read-cache: add strcmp_offset function
  2017-04-04 21:08 [PATCH v4 0/4] read-cache: speed up add_index_entry git
  2017-04-04 21:08 ` [PATCH v4 1/4] p0004-read-tree: perf test to time read-tree git
@ 2017-04-04 21:08 ` git
  2017-04-04 21:08 ` [PATCH v4 3/4] test-strcmp-offset: created test for strcmp_offset git
  2017-04-04 21:08 ` [PATCH v4 4/4] read-cache: speed up add_index_entry during checkout git
  3 siblings, 0 replies; 5+ messages in thread
From: git @ 2017-04-04 21:08 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add strcmp_offset() function to also return the offset of the
first change.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 cache.h      |  1 +
 read-cache.c | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/cache.h b/cache.h
index 80b6372..4d82490 100644
--- a/cache.h
+++ b/cache.h
@@ -574,6 +574,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
 extern int discard_index(struct index_state *);
 extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
+extern int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change);
 extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
 extern void adjust_dirname_case(struct index_state *istate, char *name);
 extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
diff --git a/read-cache.c b/read-cache.c
index 9054369..b3fc77d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -887,6 +887,35 @@ static int has_file_name(struct index_state *istate,
 	return retval;
 }
 
+
+/*
+ * Like strcmp(), but also return the offset of the first change.
+ */
+int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change)
+{
+	const unsigned char *s1 = (const unsigned char *)s1_in;
+	const unsigned char *s2 = (const unsigned char *)s2_in;
+	int diff = 0;
+	int k;
+
+	*first_change = 0;
+	for (k=0; s1[k]; k++)
+		if ((diff = (s1[k] - s2[k])))
+			goto found_it;
+	if (!s2[k])
+		return 0;
+	diff = -1;
+
+found_it:
+	*first_change = k;
+	if (diff > 0)
+		return 1;
+	else if (diff < 0)
+		return -1;
+	else
+		return 0;
+}
+
 /*
  * Do we have another file with a pathname that is a proper
  * subset of the name we're trying to add?
-- 
2.9.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v4 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-04 21:08 [PATCH v4 0/4] read-cache: speed up add_index_entry git
  2017-04-04 21:08 ` [PATCH v4 1/4] p0004-read-tree: perf test to time read-tree git
  2017-04-04 21:08 ` [PATCH v4 2/4] read-cache: add strcmp_offset function git
@ 2017-04-04 21:08 ` git
  2017-04-04 21:08 ` [PATCH v4 4/4] read-cache: speed up add_index_entry during checkout git
  3 siblings, 0 replies; 5+ messages in thread
From: git @ 2017-04-04 21:08 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile                      |  1 +
 t/helper/.gitignore           |  1 +
 t/helper/test-strcmp-offset.c | 64 +++++++++++++++++++++++++++++++++++++++++++
 t/t0065-strcmp-offset.sh      | 11 ++++++++
 4 files changed, 77 insertions(+)
 create mode 100644 t/helper/test-strcmp-offset.c
 create mode 100755 t/t0065-strcmp-offset.sh

diff --git a/Makefile b/Makefile
index 9ec6065..4c4c246 100644
--- a/Makefile
+++ b/Makefile
@@ -631,6 +631,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree
 TEST_PROGRAMS_NEED_X += test-sha1
 TEST_PROGRAMS_NEED_X += test-sha1-array
 TEST_PROGRAMS_NEED_X += test-sigchain
+TEST_PROGRAMS_NEED_X += test-strcmp-offset
 TEST_PROGRAMS_NEED_X += test-string-list
 TEST_PROGRAMS_NEED_X += test-submodule-config
 TEST_PROGRAMS_NEED_X += test-subprocess
diff --git a/t/helper/.gitignore b/t/helper/.gitignore
index d6e8b36..0a89531 100644
--- a/t/helper/.gitignore
+++ b/t/helper/.gitignore
@@ -25,6 +25,7 @@
 /test-sha1
 /test-sha1-array
 /test-sigchain
+/test-strcmp-offset
 /test-string-list
 /test-submodule-config
 /test-subprocess
diff --git a/t/helper/test-strcmp-offset.c b/t/helper/test-strcmp-offset.c
new file mode 100644
index 0000000..fe01318
--- /dev/null
+++ b/t/helper/test-strcmp-offset.c
@@ -0,0 +1,64 @@
+#include "cache.h"
+
+struct test_data {
+	const char *s1;
+	const char *s2;
+	int first_change;
+};
+
+static struct test_data data[] = {
+	{ "abc", "abc", 0 },
+	{ "abc", "def", 0 },
+
+	{ "abc", "abz", 2 },
+
+	{ "abc", "abcdef", 3 },
+
+	{ "abc\xF0zzz", "abc\xFFzzz", 3 },
+
+	{ NULL, NULL, 0 }
+};
+
+int try_pair(const char *sa, const char *sb, int first_change)
+{
+	int failed = 0;
+	int offset, r_exp, r_tst;
+	int r_exp_sign, r_tst_sign;
+
+	/*
+	 * Because differnt CRTs behave differently, only rely on signs
+	 * of the result values.
+	 */
+	r_exp = strcmp(sa, sb);
+	r_exp_sign = ((r_exp < 0) ? -1 : ((r_exp == 0) ? 0 : 1));
+
+	r_tst = strcmp_offset(sa, sb, &offset);
+	r_tst_sign = ((r_tst < 0) ? -1 : ((r_tst == 0) ? 0 : 1));
+
+	if (r_tst_sign != r_exp_sign) {
+		error("FAIL: '%s' vs '%s', result expect %d, observed %d\n",
+			  sa, sb, r_exp_sign, r_tst_sign);
+		failed = 1;
+	}
+
+	if (offset != first_change) {
+		error("FAIL: '%s' vs '%s', offset expect %d, observed %d\n",
+			  sa, sb, first_change, offset);
+		failed = 1;
+	}
+
+	return failed;
+}
+
+int cmd_main(int argc, const char **argv)
+{
+	int failed = 0;
+	int k;
+
+	for (k=0; data[k].s1; k++) {
+		failed += try_pair(data[k].s1, data[k].s2, data[k].first_change);
+		failed += try_pair(data[k].s2, data[k].s1, data[k].first_change);
+	}
+
+	return failed;
+}
diff --git a/t/t0065-strcmp-offset.sh b/t/t0065-strcmp-offset.sh
new file mode 100755
index 0000000..0176c8c
--- /dev/null
+++ b/t/t0065-strcmp-offset.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+test_description='Test strcmp_offset functionality'
+
+. ./test-lib.sh
+
+test_expect_success run_helper '
+	test-strcmp-offset
+'
+
+test_done
-- 
2.9.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v4 4/4] read-cache: speed up add_index_entry during checkout
  2017-04-04 21:08 [PATCH v4 0/4] read-cache: speed up add_index_entry git
                   ` (2 preceding siblings ...)
  2017-04-04 21:08 ` [PATCH v4 3/4] test-strcmp-offset: created test for strcmp_offset git
@ 2017-04-04 21:08 ` git
  3 siblings, 0 replies; 5+ messages in thread
From: git @ 2017-04-04 21:08 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach add_index_entry_with_check() and has_dir_name()
to see if the path of the new item is greater than the
last path in the index array before attempting to search
for it.

During checkout, merge_working_tree() populates the new
index in sorted order, so this change saves at least 2
lookups per file.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 read-cache.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/read-cache.c b/read-cache.c
index b3fc77d..bf9fc53 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -927,6 +927,21 @@ static int has_dir_name(struct index_state *istate,
 	int stage = ce_stage(ce);
 	const char *name = ce->name;
 	const char *slash = name + ce_namelen(ce);
+	int len_eq_last;
+	int cmp_last = 0;
+
+	if (istate->cache_nr > 0) {
+		/*
+		 * Compare the entry's full path with the last path in the index.
+		 * If it sorts AFTER the last entry in the index and they have no
+		 * common prefix, then there cannot be any F/D name conflicts.
+		 */
+		cmp_last = strcmp_offset(name,
+			istate->cache[istate->cache_nr-1]->name,
+			&len_eq_last);
+		if (cmp_last > 0 && len_eq_last == 0)
+			return retval;
+	}
 
 	for (;;) {
 		int len;
@@ -939,6 +954,24 @@ static int has_dir_name(struct index_state *istate,
 		}
 		len = slash - name;
 
+		if (cmp_last > 0) {
+			/*
+			 * If this part of the directory prefix (including the trailing
+			 * slash) already appears in the path of the last entry in the
+			 * index, then we cannot also have a file with this prefix (or
+			 * any parent directory prefix).
+			 */
+			if (len+1 <= len_eq_last)
+				return retval;
+			/*
+			 * If this part of the directory prefix (excluding the trailing
+			 * slash) is longer than the known equal portions, then this part
+			 * of the prefix cannot collide with a file.  Go on to the parent.
+			 */
+			if (len > len_eq_last)
+				continue;
+		}
+
 		pos = index_name_stage_pos(istate, name, len, stage);
 		if (pos >= 0) {
 			/*
@@ -1030,7 +1063,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
 
 	if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
 		cache_tree_invalidate_path(istate, ce->name);
-	pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
+
+	/*
+	 * If this entry's path sorts after the last entry in the index,
+	 * we can avoid searching for it.
+	 */
+	if (istate->cache_nr > 0 &&
+		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
+		pos = -istate->cache_nr - 1;
+	else
+		pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
 
 	/* existing match? Just replace it. */
 	if (pos >= 0) {
-- 
2.9.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-04-04 21:09 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-04 21:08 [PATCH v4 0/4] read-cache: speed up add_index_entry git
2017-04-04 21:08 ` [PATCH v4 1/4] p0004-read-tree: perf test to time read-tree git
2017-04-04 21:08 ` [PATCH v4 2/4] read-cache: add strcmp_offset function git
2017-04-04 21:08 ` [PATCH v4 3/4] test-strcmp-offset: created test for strcmp_offset git
2017-04-04 21:08 ` [PATCH v4 4/4] read-cache: speed up add_index_entry during checkout git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.