All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 0/4] read-cache: speed up add_index_entry
@ 2017-04-05 17:38 git
  2017-04-05 17:38 ` [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree git
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: git @ 2017-04-05 17:38 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

[This is labeled as version 5 because of a cut-n-paste
accident where my first version of this patch was labeled
version 4.]

This version adds more perf tests to p0004-read-tree.sh

================
Teach add_index_entry_with_check() and has_dir_name()
to avoid index lookups if the given path sorts after
the last entry in the index.

This saves at least 2 binary searches per entry.

This improves performance during checkout and read-tree because
merge_working_tree() and unpack_trees() processes a list of already
sorted entries.

This helps performance on very large repositories.

================
Before and after numbers on index with 1M files
./p0004-read-tree.sh
0004.2: read-tree work1 (1003037)          3.21(2.54+0.62)
0004.3: switch base work1 (3038 1003037)   7.49(5.39+1.84)
0004.5: switch work1 work2 (1003037)       11.91(8.38+3.00)
0004.6: switch commit aliases (1003037)    12.22(8.30+3.06)

./p0004-read-tree.sh
0004.2: read-tree work1 (1003040)          2.40(1.65+0.73)
0004.3: switch base work1 (3041 1003040)   6.07(4.12+1.66)
0004.5: switch work1 work2 (1003040)       10.23(6.76+2.92)
0004.6: switch commit aliases (1003040)    10.53(6.97+2.83)
================

Jeff Hostetler (4):
  p0004-read-tree: perf test to time read-tree
  read-cache: add strcmp_offset function
  test-strcmp-offset: created test for strcmp_offset
  read-cache: speed up add_index_entry during checkout

 Makefile                      |   1 +
 cache.h                       |   1 +
 read-cache.c                  |  73 +++++++++++++++++++++++++-
 t/helper/.gitignore           |   1 +
 t/helper/test-strcmp-offset.c |  64 +++++++++++++++++++++++
 t/perf/p0004-read-tree.sh     | 116 ++++++++++++++++++++++++++++++++++++++++++
 t/t0065-strcmp-offset.sh      |  11 ++++
 7 files changed, 266 insertions(+), 1 deletion(-)
 create mode 100644 t/helper/test-strcmp-offset.c
 create mode 100755 t/perf/p0004-read-tree.sh
 create mode 100755 t/t0065-strcmp-offset.sh

-- 
2.9.3


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree
  2017-04-05 17:38 [PATCH v5 0/4] read-cache: speed up add_index_entry git
@ 2017-04-05 17:38 ` git
  2017-04-06 20:20   ` René Scharfe
  2017-04-05 17:38 ` [PATCH v5 2/4] read-cache: add strcmp_offset function git
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: git @ 2017-04-05 17:38 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/perf/p0004-read-tree.sh | 116 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100755 t/perf/p0004-read-tree.sh

diff --git a/t/perf/p0004-read-tree.sh b/t/perf/p0004-read-tree.sh
new file mode 100755
index 0000000..5d8bbf5
--- /dev/null
+++ b/t/perf/p0004-read-tree.sh
@@ -0,0 +1,116 @@
+#!/bin/sh
+
+test_description="Tests performance of read-tree"
+
+. ./perf-lib.sh
+
+test_perf_default_repo
+test_checkout_worktree
+
+## usage: dir depth width files
+make_paths () {
+	for f in $(seq $4)
+	do
+		echo $1/file$f
+	done;
+	if test $2 -gt 0;
+	then
+		for w in $(seq $3)
+		do
+			make_paths $1/dir$w $(($2 - 1)) $3 $4
+		done
+	fi
+	return 0
+}
+
+fill_index () {
+	make_paths $1 $2 $3 $4 |
+	sed "s/^/100644 $EMPTY_BLOB	/" |
+	git update-index --index-info
+	return 0
+}
+
+br_base=xxx_base_xxx
+br_work1=xxx_work1_xxx
+br_work2=xxx_work2_xxx
+br_work3=xxx_work3_xxx
+
+new_dir=xxx_dir_xxx
+
+## (5, 10, 9) will create 999,999 files.
+## (4, 10, 9) will create  99,999 files.
+depth=5
+width=10
+files=9
+
+export br_base
+export br_work1
+export br_work2
+export br_work3
+
+export new_dir
+
+export depth
+export width
+export files
+
+## The number of files in the xxx_base_xxx branch.
+nr_base=$(git ls-files | wc -l)
+export nr_base
+
+## Inflate the index with thousands of empty files and commit it.
+## Turn on sparse-checkout so that we don't have to populate them
+## later when we start switching branches.
+test_expect_success 'inflate the index' '
+	git reset --hard &&
+	git branch $br_base &&
+	git branch $br_work1 &&
+	git checkout $br_work1 &&
+	fill_index $new_dir $depth $width $files &&
+	git commit -m $br_work1 &&
+	echo $new_dir/file1 >.git/info/sparse-checkout &&
+	git config --local core.sparsecheckout 1 &&
+	git reset --hard
+'
+
+## The number of files in the xxx_work1_xxx branch.
+nr_work1=$(git ls-files | wc -l)
+export nr_work1
+
+test_perf "read-tree work1 ($nr_work1)" '
+	git read-tree -m $br_base $br_work1 -n
+'
+
+## Alternate between base and work branches several
+## times to measure a large change.
+test_perf "switch base work1 ($nr_base $nr_work1)" '
+	git checkout $br_base &&
+	git checkout $br_work1
+'
+
+## Create work2 by modifying 1 file in work1.
+## Create work3 as an alias of work2.
+test_expect_success 'setup work2' '
+	git branch $br_work2 &&
+	git checkout $br_work2 &&
+	echo x >$new_dir/file1 &&
+	git add $new_dir/file1 &&
+	git commit -m $br_work2 &&
+	git branch $br_work3
+'
+
+## Alternate between work1 and work2 several times
+## to measure a very small change.
+test_perf "switch work1 work2 ($nr_work1)" '
+	git checkout $br_work1 &&
+	git checkout $br_work2
+'
+
+## Alternate between branches work2 and work3 which
+## are aliases of the same commit.
+test_perf "switch commit aliases ($nr_work1)" '
+	git checkout $br_work3 &&
+	git checkout $br_work2
+'
+
+test_done
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v5 2/4] read-cache: add strcmp_offset function
  2017-04-05 17:38 [PATCH v5 0/4] read-cache: speed up add_index_entry git
  2017-04-05 17:38 ` [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree git
@ 2017-04-05 17:38 ` git
  2017-04-06 14:19   ` SZEDER Gábor
  2017-04-05 17:38 ` [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset git
  2017-04-05 17:38 ` [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout git
  3 siblings, 1 reply; 16+ messages in thread
From: git @ 2017-04-05 17:38 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Add strcmp_offset() function to also return the offset of the
first change.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 cache.h      |  1 +
 read-cache.c | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/cache.h b/cache.h
index 80b6372..4d82490 100644
--- a/cache.h
+++ b/cache.h
@@ -574,6 +574,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
 extern int discard_index(struct index_state *);
 extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
+extern int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change);
 extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
 extern void adjust_dirname_case(struct index_state *istate, char *name);
 extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
diff --git a/read-cache.c b/read-cache.c
index 9054369..b3fc77d 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -887,6 +887,35 @@ static int has_file_name(struct index_state *istate,
 	return retval;
 }
 
+
+/*
+ * Like strcmp(), but also return the offset of the first change.
+ */
+int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change)
+{
+	const unsigned char *s1 = (const unsigned char *)s1_in;
+	const unsigned char *s2 = (const unsigned char *)s2_in;
+	int diff = 0;
+	int k;
+
+	*first_change = 0;
+	for (k=0; s1[k]; k++)
+		if ((diff = (s1[k] - s2[k])))
+			goto found_it;
+	if (!s2[k])
+		return 0;
+	diff = -1;
+
+found_it:
+	*first_change = k;
+	if (diff > 0)
+		return 1;
+	else if (diff < 0)
+		return -1;
+	else
+		return 0;
+}
+
 /*
  * Do we have another file with a pathname that is a proper
  * subset of the name we're trying to add?
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-05 17:38 [PATCH v5 0/4] read-cache: speed up add_index_entry git
  2017-04-05 17:38 ` [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree git
  2017-04-05 17:38 ` [PATCH v5 2/4] read-cache: add strcmp_offset function git
@ 2017-04-05 17:38 ` git
  2017-04-05 22:47   ` SZEDER Gábor
  2017-04-06 20:20   ` René Scharfe
  2017-04-05 17:38 ` [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout git
  3 siblings, 2 replies; 16+ messages in thread
From: git @ 2017-04-05 17:38 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 Makefile                      |  1 +
 t/helper/.gitignore           |  1 +
 t/helper/test-strcmp-offset.c | 64 +++++++++++++++++++++++++++++++++++++++++++
 t/t0065-strcmp-offset.sh      | 11 ++++++++
 4 files changed, 77 insertions(+)
 create mode 100644 t/helper/test-strcmp-offset.c
 create mode 100755 t/t0065-strcmp-offset.sh

diff --git a/Makefile b/Makefile
index 9ec6065..4c4c246 100644
--- a/Makefile
+++ b/Makefile
@@ -631,6 +631,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree
 TEST_PROGRAMS_NEED_X += test-sha1
 TEST_PROGRAMS_NEED_X += test-sha1-array
 TEST_PROGRAMS_NEED_X += test-sigchain
+TEST_PROGRAMS_NEED_X += test-strcmp-offset
 TEST_PROGRAMS_NEED_X += test-string-list
 TEST_PROGRAMS_NEED_X += test-submodule-config
 TEST_PROGRAMS_NEED_X += test-subprocess
diff --git a/t/helper/.gitignore b/t/helper/.gitignore
index d6e8b36..0a89531 100644
--- a/t/helper/.gitignore
+++ b/t/helper/.gitignore
@@ -25,6 +25,7 @@
 /test-sha1
 /test-sha1-array
 /test-sigchain
+/test-strcmp-offset
 /test-string-list
 /test-submodule-config
 /test-subprocess
diff --git a/t/helper/test-strcmp-offset.c b/t/helper/test-strcmp-offset.c
new file mode 100644
index 0000000..fe01318
--- /dev/null
+++ b/t/helper/test-strcmp-offset.c
@@ -0,0 +1,64 @@
+#include "cache.h"
+
+struct test_data {
+	const char *s1;
+	const char *s2;
+	int first_change;
+};
+
+static struct test_data data[] = {
+	{ "abc", "abc", 0 },
+	{ "abc", "def", 0 },
+
+	{ "abc", "abz", 2 },
+
+	{ "abc", "abcdef", 3 },
+
+	{ "abc\xF0zzz", "abc\xFFzzz", 3 },
+
+	{ NULL, NULL, 0 }
+};
+
+int try_pair(const char *sa, const char *sb, int first_change)
+{
+	int failed = 0;
+	int offset, r_exp, r_tst;
+	int r_exp_sign, r_tst_sign;
+
+	/*
+	 * Because differnt CRTs behave differently, only rely on signs
+	 * of the result values.
+	 */
+	r_exp = strcmp(sa, sb);
+	r_exp_sign = ((r_exp < 0) ? -1 : ((r_exp == 0) ? 0 : 1));
+
+	r_tst = strcmp_offset(sa, sb, &offset);
+	r_tst_sign = ((r_tst < 0) ? -1 : ((r_tst == 0) ? 0 : 1));
+
+	if (r_tst_sign != r_exp_sign) {
+		error("FAIL: '%s' vs '%s', result expect %d, observed %d\n",
+			  sa, sb, r_exp_sign, r_tst_sign);
+		failed = 1;
+	}
+
+	if (offset != first_change) {
+		error("FAIL: '%s' vs '%s', offset expect %d, observed %d\n",
+			  sa, sb, first_change, offset);
+		failed = 1;
+	}
+
+	return failed;
+}
+
+int cmd_main(int argc, const char **argv)
+{
+	int failed = 0;
+	int k;
+
+	for (k=0; data[k].s1; k++) {
+		failed += try_pair(data[k].s1, data[k].s2, data[k].first_change);
+		failed += try_pair(data[k].s2, data[k].s1, data[k].first_change);
+	}
+
+	return failed;
+}
diff --git a/t/t0065-strcmp-offset.sh b/t/t0065-strcmp-offset.sh
new file mode 100755
index 0000000..0176c8c
--- /dev/null
+++ b/t/t0065-strcmp-offset.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+test_description='Test strcmp_offset functionality'
+
+. ./test-lib.sh
+
+test_expect_success run_helper '
+	test-strcmp-offset
+'
+
+test_done
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout
  2017-04-05 17:38 [PATCH v5 0/4] read-cache: speed up add_index_entry git
                   ` (2 preceding siblings ...)
  2017-04-05 17:38 ` [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset git
@ 2017-04-05 17:38 ` git
  2017-04-05 22:54   ` SZEDER Gábor
  3 siblings, 1 reply; 16+ messages in thread
From: git @ 2017-04-05 17:38 UTC (permalink / raw)
  To: git; +Cc: gitster, peff, Jeff Hostetler

From: Jeff Hostetler <jeffhost@microsoft.com>

Teach add_index_entry_with_check() and has_dir_name()
to see if the path of the new item is greater than the
last path in the index array before attempting to search
for it.

During checkout, merge_working_tree() populates the new
index in sorted order, so this change saves at least 2
lookups per file.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 read-cache.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/read-cache.c b/read-cache.c
index b3fc77d..bf9fc53 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -927,6 +927,21 @@ static int has_dir_name(struct index_state *istate,
 	int stage = ce_stage(ce);
 	const char *name = ce->name;
 	const char *slash = name + ce_namelen(ce);
+	int len_eq_last;
+	int cmp_last = 0;
+
+	if (istate->cache_nr > 0) {
+		/*
+		 * Compare the entry's full path with the last path in the index.
+		 * If it sorts AFTER the last entry in the index and they have no
+		 * common prefix, then there cannot be any F/D name conflicts.
+		 */
+		cmp_last = strcmp_offset(name,
+			istate->cache[istate->cache_nr-1]->name,
+			&len_eq_last);
+		if (cmp_last > 0 && len_eq_last == 0)
+			return retval;
+	}
 
 	for (;;) {
 		int len;
@@ -939,6 +954,24 @@ static int has_dir_name(struct index_state *istate,
 		}
 		len = slash - name;
 
+		if (cmp_last > 0) {
+			/*
+			 * If this part of the directory prefix (including the trailing
+			 * slash) already appears in the path of the last entry in the
+			 * index, then we cannot also have a file with this prefix (or
+			 * any parent directory prefix).
+			 */
+			if (len+1 <= len_eq_last)
+				return retval;
+			/*
+			 * If this part of the directory prefix (excluding the trailing
+			 * slash) is longer than the known equal portions, then this part
+			 * of the prefix cannot collide with a file.  Go on to the parent.
+			 */
+			if (len > len_eq_last)
+				continue;
+		}
+
 		pos = index_name_stage_pos(istate, name, len, stage);
 		if (pos >= 0) {
 			/*
@@ -1030,7 +1063,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
 
 	if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
 		cache_tree_invalidate_path(istate, ce->name);
-	pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
+
+	/*
+	 * If this entry's path sorts after the last entry in the index,
+	 * we can avoid searching for it.
+	 */
+	if (istate->cache_nr > 0 &&
+		strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
+		pos = -istate->cache_nr - 1;
+	else
+		pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
 
 	/* existing match? Just replace it. */
 	if (pos >= 0) {
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-05 17:38 ` [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset git
@ 2017-04-05 22:47   ` SZEDER Gábor
  2017-04-06  8:21     ` Johannes Schindelin
  2017-04-06 20:20   ` René Scharfe
  1 sibling, 1 reply; 16+ messages in thread
From: SZEDER Gábor @ 2017-04-05 22:47 UTC (permalink / raw)
  To: git; +Cc: SZEDER Gábor, gitster, peff, Jeff Hostetler, git

I think this patch should be squashed into the previous commit;  I
don't see any reason why the tests should be added in a different
commit than the function they are testing.  However...

> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> ---
>  Makefile                      |  1 +
>  t/helper/.gitignore           |  1 +
>  t/helper/test-strcmp-offset.c | 64 +++++++++++++++++++++++++++++++++++++++++++
>  t/t0065-strcmp-offset.sh      | 11 ++++++++
>  4 files changed, 77 insertions(+)
>  create mode 100644 t/helper/test-strcmp-offset.c
>  create mode 100755 t/t0065-strcmp-offset.sh

Sure, tests are good, but I have to wonder how this would scale in the
long term, when even such simple functions would get their own
t/helper/test-func executable and t/tNNNN-func.sh script.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout
  2017-04-05 17:38 ` [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout git
@ 2017-04-05 22:54   ` SZEDER Gábor
  2017-04-06 14:05     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: SZEDER Gábor @ 2017-04-05 22:54 UTC (permalink / raw)
  To: git; +Cc: SZEDER Gábor, gitster, peff, Jeff Hostetler, git

> Teach add_index_entry_with_check() and has_dir_name()
> to see if the path of the new item is greater than the
> last path in the index array before attempting to search
> for it.
> 
> During checkout, merge_working_tree() populates the new
> index in sorted order, so this change saves at least 2
> lookups per file.

I think the performance numbers from the cover letter should be
included here, so they will be included in the history.

> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-05 22:47   ` SZEDER Gábor
@ 2017-04-06  8:21     ` Johannes Schindelin
  2017-04-06 14:25       ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: Johannes Schindelin @ 2017-04-06  8:21 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: git, gitster, peff, Jeff Hostetler, git

[-- Attachment #1: Type: text/plain, Size: 2040 bytes --]

Hi Gábor,

On Thu, 6 Apr 2017, SZEDER Gábor wrote:

> I think this patch should be squashed into the previous commit;  I
> don't see any reason why the tests should be added in a different
> commit than the function they are testing.

I am of two minds there. In some cases, the newly added test demonstrates
the intended usage, and therefore makes for a nice documentation. In other
cases, the new test is large enough to stand on its own, i.e. to merit a
separate patch (also to make reviewing easier).

In this particular case, I tend to the latter: it is large enough a patch
that it is easier to review as a separate patch.

> >  t/helper/test-strcmp-offset.c | 64 +++++++++++++++++++++++++++++++++++++++++++
> >  t/t0065-strcmp-offset.sh      | 11 ++++++++
> >  4 files changed, 77 insertions(+)
> >  create mode 100644 t/helper/test-strcmp-offset.c
> >  create mode 100755 t/t0065-strcmp-offset.sh
> 
> Sure, tests are good, but I have to wonder how this would scale in the
> long term, when even such simple functions would get their own
> t/helper/test-func executable and t/tNNNN-func.sh script.

True. The proliferation of executables in t/helper/ got a little out of
hand.

But there is nothing preventing us from consolidating a few of them into a
single executable, using our wonderful option parsing function with
OPT_CMDMODE to switch between the different functions.

I could see, for example, how we could consolidate all string-related
test helpers into a single one, say, test-strings:

t/helper/test-ctype.c
t/helper/test-regex.c
t/helper/test-strcmp-offset.c
t/helper/test-string-list.c
t/helper/test-line-buffer.c
t/helper/test-urlmatch-normalization.c
t/helper/test-wildmatch.c

Also, these helpers seem to be related to index handling and could go into
a new test-index helper:

t/helper/test-dump-cache-tree.c
t/helper/test-dump-split-index.c
t/helper/test-dump-untracked-cache.c
t/helper/test-index-version.c
t/helper/test-scrap-cache-tree.c

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout
  2017-04-05 22:54   ` SZEDER Gábor
@ 2017-04-06 14:05     ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-04-06 14:05 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: gitster, peff, Jeff Hostetler, git



On 4/5/2017 6:54 PM, SZEDER Gábor wrote:
>> Teach add_index_entry_with_check() and has_dir_name()
>> to see if the path of the new item is greater than the
>> last path in the index array before attempting to search
>> for it.
>>
>> During checkout, merge_working_tree() populates the new
>> index in sorted order, so this change saves at least 2
>> lookups per file.
>
> I think the performance numbers from the cover letter should be
> included here, so they will be included in the history.

Good point.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 2/4] read-cache: add strcmp_offset function
  2017-04-05 17:38 ` [PATCH v5 2/4] read-cache: add strcmp_offset function git
@ 2017-04-06 14:19   ` SZEDER Gábor
  2017-04-06 15:58     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: SZEDER Gábor @ 2017-04-06 14:19 UTC (permalink / raw)
  To: git; +Cc: SZEDER Gábor, gitster, peff, Jeff Hostetler, git

> Add strcmp_offset() function to also return the offset of the
> first change.
> 
> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> ---
>  cache.h      |  1 +
>  read-cache.c | 29 +++++++++++++++++++++++++++++
>  2 files changed, 30 insertions(+)
> 
> diff --git a/cache.h b/cache.h
> index 80b6372..4d82490 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -574,6 +574,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
>  extern int discard_index(struct index_state *);
>  extern int unmerged_index(const struct index_state *);
>  extern int verify_path(const char *path);
> +extern int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change);
>  extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
>  extern void adjust_dirname_case(struct index_state *istate, char *name);
>  extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
> diff --git a/read-cache.c b/read-cache.c
> index 9054369..b3fc77d 100644
> --- a/read-cache.c
> +++ b/read-cache.c

read-cache.c is probably not the best place for such a general string
utility function, though I'm not sure where its most apropriate place
would be.

> @@ -887,6 +887,35 @@ static int has_file_name(struct index_state *istate,
>  	return retval;
>  }
>  
> +
> +/*
> + * Like strcmp(), but also return the offset of the first change.

This comment doesn't tell us what will happen with the offset
parameter if it is NULL or if the two strings are equal.  I don't
really care about the safety check for NULL: a callsite not interested
in the offset should just call strcmp() instead.  I'm fine either way.
However, setting it to 0 when the strings are equal doesn't seem quite
right, does it.

> + */
> +int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change)
> +{
> +	const unsigned char *s1 = (const unsigned char *)s1_in;
> +	const unsigned char *s2 = (const unsigned char *)s2_in;
> +	int diff = 0;
> +	int k;
> +
> +	*first_change = 0;
> +	for (k=0; s1[k]; k++)
> +		if ((diff = (s1[k] - s2[k])))
> +			goto found_it;
> +	if (!s2[k])
> +		return 0;
> +	diff = -1;
> +
> +found_it:
> +	*first_change = k;
> +	if (diff > 0)
> +		return 1;
> +	else if (diff < 0)
> +		return -1;
> +	else
> +		return 0;
> +}

The implementation seems to me a bit long-winded, with more
conditional statements than necessary.  How about something like this
instead?  Much shorter, no goto and only half the number of
conditionals to reason about, leaves *first_change untouched when the
two strings are equal, and deals with it being NULL.

int strcmp_offset(const char *s1, const char *s2, int *first_change)
{
	int k;

	for (k = 0; s1[k] == s2[k]; k++)
		if (s1[k] == '\0')
			return 0;

	if (first_change)
		*first_change = k;
	return ((unsigned char *)s1)[k] - ((unsigned char *)s2)[k];
}


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-06  8:21     ` Johannes Schindelin
@ 2017-04-06 14:25       ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-04-06 14:25 UTC (permalink / raw)
  To: Johannes Schindelin, SZEDER Gábor; +Cc: gitster, peff, Jeff Hostetler, git



On 4/6/2017 4:21 AM, Johannes Schindelin wrote:
> Hi Gábor,
>
> On Thu, 6 Apr 2017, SZEDER Gábor wrote:
>
>> I think this patch should be squashed into the previous commit;  I
>> don't see any reason why the tests should be added in a different
>> commit than the function they are testing.

Will do.

>
> I am of two minds there. In some cases, the newly added test demonstrates
> the intended usage, and therefore makes for a nice documentation. In other
> cases, the new test is large enough to stand on its own, i.e. to merit a
> separate patch (also to make reviewing easier).
>
> In this particular case, I tend to the latter: it is large enough a patch
> that it is easier to review as a separate patch.
>
>>>  t/helper/test-strcmp-offset.c | 64 +++++++++++++++++++++++++++++++++++++++++++
>>>  t/t0065-strcmp-offset.sh      | 11 ++++++++
>>>  4 files changed, 77 insertions(+)
>>>  create mode 100644 t/helper/test-strcmp-offset.c
>>>  create mode 100755 t/t0065-strcmp-offset.sh
>>
>> Sure, tests are good, but I have to wonder how this would scale in the
>> long term, when even such simple functions would get their own
>> t/helper/test-func executable and t/tNNNN-func.sh script.

I think this is the start of a larger conversation on
how we want to handle function-level unit-testing and
outside the scope of this patch series.

>
> True. The proliferation of executables in t/helper/ got a little out of
> hand.
>
> But there is nothing preventing us from consolidating a few of them into a
> single executable, using our wonderful option parsing function with
> OPT_CMDMODE to switch between the different functions.
>
> I could see, for example, how we could consolidate all string-related
> test helpers into a single one, say, test-strings:
>
> t/helper/test-ctype.c
> t/helper/test-regex.c
> t/helper/test-strcmp-offset.c
> t/helper/test-string-list.c
> t/helper/test-line-buffer.c
> t/helper/test-urlmatch-normalization.c
> t/helper/test-wildmatch.c
>
> Also, these helpers seem to be related to index handling and could go into
> a new test-index helper:
>
> t/helper/test-dump-cache-tree.c
> t/helper/test-dump-split-index.c
> t/helper/test-dump-untracked-cache.c
> t/helper/test-index-version.c
> t/helper/test-scrap-cache-tree.c

This is an interesting proposal.  I could go one further and
say we have a single "t/helper/unit-test.c" that has series
of "t/helper/builtin/*.c" commands (using the same mechanism
as the builtin commands in git.exe).  The question then is
how much of the test logic and/or parameters go into the shell
scripts.

But again, all of that is outside my scope here.
Jeff


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 2/4] read-cache: add strcmp_offset function
  2017-04-06 14:19   ` SZEDER Gábor
@ 2017-04-06 15:58     ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-04-06 15:58 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: gitster, peff, Jeff Hostetler, git



On 4/6/2017 10:19 AM, SZEDER Gábor wrote:
>> Add strcmp_offset() function to also return the offset of the
>> first change.
>>
>> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
>> ---
>>  cache.h      |  1 +
>>  read-cache.c | 29 +++++++++++++++++++++++++++++
>>  2 files changed, 30 insertions(+)
>>
>> diff --git a/cache.h b/cache.h
>> index 80b6372..4d82490 100644
>> --- a/cache.h
>> +++ b/cache.h
>> @@ -574,6 +574,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
>>  extern int discard_index(struct index_state *);
>>  extern int unmerged_index(const struct index_state *);
>>  extern int verify_path(const char *path);
>> +extern int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change);
>>  extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
>>  extern void adjust_dirname_case(struct index_state *istate, char *name);
>>  extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
>> diff --git a/read-cache.c b/read-cache.c
>> index 9054369..b3fc77d 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>
> read-cache.c is probably not the best place for such a general string
> utility function, though I'm not sure where its most apropriate place
> would be.

Yeah, I looked and didn't see any place so I left it here
near it's first usage for now.

>
>> @@ -887,6 +887,35 @@ static int has_file_name(struct index_state *istate,
>>  	return retval;
>>  }
>>
>> +
>> +/*
>> + * Like strcmp(), but also return the offset of the first change.
>
> This comment doesn't tell us what will happen with the offset
> parameter if it is NULL or if the two strings are equal.  I don't
> really care about the safety check for NULL: a callsite not interested
> in the offset should just call strcmp() instead.  I'm fine either way.
> However, setting it to 0 when the strings are equal doesn't seem quite
> right, does it.

Good catch. I'll add a null check.

When the strings are equal, I'm not sure any one value is
better than another.  I could leave it undefined or set it
to the length.  That might be more useful since we have it on
hand and it might save the caller a strlen() later.

>
>> + */
>> +int strcmp_offset(const char *s1_in, const char *s2_in, int *first_change)
>> +{
>> +	const unsigned char *s1 = (const unsigned char *)s1_in;
>> +	const unsigned char *s2 = (const unsigned char *)s2_in;
>> +	int diff = 0;
>> +	int k;
>> +
>> +	*first_change = 0;
>> +	for (k=0; s1[k]; k++)
>> +		if ((diff = (s1[k] - s2[k])))
>> +			goto found_it;
>> +	if (!s2[k])
>> +		return 0;
>> +	diff = -1;
>> +
>> +found_it:
>> +	*first_change = k;
>> +	if (diff > 0)
>> +		return 1;
>> +	else if (diff < 0)
>> +		return -1;
>> +	else
>> +		return 0;
>> +}
>
> The implementation seems to me a bit long-winded, with more
> conditional statements than necessary.  How about something like this
> instead?  Much shorter, no goto and only half the number of
> conditionals to reason about, leaves *first_change untouched when the
> two strings are equal, and deals with it being NULL.
>
> int strcmp_offset(const char *s1, const char *s2, int *first_change)
> {
> 	int k;
>
> 	for (k = 0; s1[k] == s2[k]; k++)
> 		if (s1[k] == '\0')
> 			return 0;
>
> 	if (first_change)
> 		*first_change = k;
> 	return ((unsigned char *)s1)[k] - ((unsigned char *)s2)[k];
> }
>

Let me give that a try.  Thanks!
Jeff


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-05 17:38 ` [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset git
  2017-04-05 22:47   ` SZEDER Gábor
@ 2017-04-06 20:20   ` René Scharfe
  2017-04-06 20:42     ` Jeff Hostetler
  1 sibling, 1 reply; 16+ messages in thread
From: René Scharfe @ 2017-04-06 20:20 UTC (permalink / raw)
  To: git, git; +Cc: gitster, peff, Jeff Hostetler

Am 05.04.2017 um 19:38 schrieb git@jeffhostetler.com:
> diff --git a/t/helper/test-strcmp-offset.c b/t/helper/test-strcmp-offset.c
> new file mode 100644
> index 0000000..fe01318
> --- /dev/null
> +++ b/t/helper/test-strcmp-offset.c
> @@ -0,0 +1,64 @@
> +#include "cache.h"
> +
> +struct test_data {
> +	const char *s1;
> +	const char *s2;
> +	int first_change;
> +};
> +
> +static struct test_data data[] = {
> +	{ "abc", "abc", 0 },
> +	{ "abc", "def", 0 },
> +
> +	{ "abc", "abz", 2 },
> +
> +	{ "abc", "abcdef", 3 },
> +
> +	{ "abc\xF0zzz", "abc\xFFzzz", 3 },
> +
> +	{ NULL, NULL, 0 }
> +};
> +
> +int try_pair(const char *sa, const char *sb, int first_change)

This should be static, right?  Found with -Wmissing-prototypes.

> +{
> +	int failed = 0;
> +	int offset, r_exp, r_tst;
> +	int r_exp_sign, r_tst_sign;
> +
> +	/*
> +	 * Because differnt CRTs behave differently, only rely on signs

s/differnt/different/

René

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree
  2017-04-05 17:38 ` [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree git
@ 2017-04-06 20:20   ` René Scharfe
  2017-04-06 20:41     ` Jeff Hostetler
  0 siblings, 1 reply; 16+ messages in thread
From: René Scharfe @ 2017-04-06 20:20 UTC (permalink / raw)
  To: git, git; +Cc: gitster, peff, Jeff Hostetler

Am 05.04.2017 um 19:38 schrieb git@jeffhostetler.com:
> From: Jeff Hostetler <jeffhost@microsoft.com>
> 
> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> ---
>  t/perf/p0004-read-tree.sh | 116 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 116 insertions(+)
>  create mode 100755 t/perf/p0004-read-tree.sh
> 
> diff --git a/t/perf/p0004-read-tree.sh b/t/perf/p0004-read-tree.sh
> new file mode 100755
> index 0000000..5d8bbf5
> --- /dev/null
> +++ b/t/perf/p0004-read-tree.sh
> @@ -0,0 +1,116 @@
> +#!/bin/sh
> +
> +test_description="Tests performance of read-tree"
> +
> +. ./perf-lib.sh
> +
> +test_perf_default_repo
> +test_checkout_worktree
> +
> +## usage: dir depth width files
> +make_paths () {
> +	for f in $(seq $4)
> +	do
> +		echo $1/file$f
> +	done;
> +	if test $2 -gt 0;
> +	then
> +		for w in $(seq $3)
> +		do
> +			make_paths $1/dir$w $(($2 - 1)) $3 $4
> +		done
> +	fi
> +	return 0
> +}

"make_paths xxx_dir_xxx 5 10 9" takes more than a minute for me.
Providing its results as a file would be quicker but less flexible.
The following command prints the same result in less than a second.

	awk -v dir=xxx_dir_xxx -v depth=5 -v width=10 -v files=9 '
        	function make_paths(dir, depth, width, files,  i)
	        {
        	        for (i = 1; i <= files; i++) {
	                        print dir "/file" i
        	        }
	                if (depth > 0) {
        	                for (i = 1; i <= width; i++) {
	                                make_paths(dir "/dir" i, depth - 1, width, files)
        	                }
                	}
	        }
	        END {make_paths(dir, depth, width, files)}
	' </dev/null

It's faster because it avoids calling seq thousands of times.

> +
> +fill_index () {
> +	make_paths $1 $2 $3 $4 |
> +	sed "s/^/100644 $EMPTY_BLOB	/" |

You could add the prefix to the script above and avoid this sed call
as well.

René

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree
  2017-04-06 20:20   ` René Scharfe
@ 2017-04-06 20:41     ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-04-06 20:41 UTC (permalink / raw)
  To: René Scharfe, git; +Cc: gitster, peff, Jeff Hostetler



On 4/6/2017 4:20 PM, René Scharfe wrote:
> Am 05.04.2017 um 19:38 schrieb git@jeffhostetler.com:
>> From: Jeff Hostetler <jeffhost@microsoft.com>
>>
>> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
>> ---
>>  t/perf/p0004-read-tree.sh | 116 ++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 116 insertions(+)
>>  create mode 100755 t/perf/p0004-read-tree.sh
>>
>> diff --git a/t/perf/p0004-read-tree.sh b/t/perf/p0004-read-tree.sh
>> new file mode 100755
>> index 0000000..5d8bbf5
>> --- /dev/null
>> +++ b/t/perf/p0004-read-tree.sh
>> @@ -0,0 +1,116 @@
>> +#!/bin/sh
>> +
>> +test_description="Tests performance of read-tree"
>> +
>> +. ./perf-lib.sh
>> +
>> +test_perf_default_repo
>> +test_checkout_worktree
>> +
>> +## usage: dir depth width files
>> +make_paths () {
>> +	for f in $(seq $4)
>> +	do
>> +		echo $1/file$f
>> +	done;
>> +	if test $2 -gt 0;
>> +	then
>> +		for w in $(seq $3)
>> +		do
>> +			make_paths $1/dir$w $(($2 - 1)) $3 $4
>> +		done
>> +	fi
>> +	return 0
>> +}
>
> "make_paths xxx_dir_xxx 5 10 9" takes more than a minute for me.
> Providing its results as a file would be quicker but less flexible.
> The following command prints the same result in less than a second.
>
> 	awk -v dir=xxx_dir_xxx -v depth=5 -v width=10 -v files=9 '
>         	function make_paths(dir, depth, width, files,  i)
> 	        {
>         	        for (i = 1; i <= files; i++) {
> 	                        print dir "/file" i
>         	        }
> 	                if (depth > 0) {
>         	                for (i = 1; i <= width; i++) {
> 	                                make_paths(dir "/dir" i, depth - 1, width, files)
>         	                }
>                 	}
> 	        }
> 	        END {make_paths(dir, depth, width, files)}
> 	' </dev/null
>
> It's faster because it avoids calling seq thousands of times.

Very nice!!!  I'll give that a try.  Thanks!

>
>> +
>> +fill_index () {
>> +	make_paths $1 $2 $3 $4 |
>> +	sed "s/^/100644 $EMPTY_BLOB	/" |
>
> You could add the prefix to the script above and avoid this sed call
> as well.
>
> René
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset
  2017-04-06 20:20   ` René Scharfe
@ 2017-04-06 20:42     ` Jeff Hostetler
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Hostetler @ 2017-04-06 20:42 UTC (permalink / raw)
  To: René Scharfe, git; +Cc: gitster, peff, Jeff Hostetler



On 4/6/2017 4:20 PM, René Scharfe wrote:
> Am 05.04.2017 um 19:38 schrieb git@jeffhostetler.com:
>> diff --git a/t/helper/test-strcmp-offset.c
>> b/t/helper/test-strcmp-offset.c
>> new file mode 100644
>> index 0000000..fe01318
>> --- /dev/null
>> +++ b/t/helper/test-strcmp-offset.c
>> @@ -0,0 +1,64 @@
>> +#include "cache.h"
>> +
>> +struct test_data {
>> +    const char *s1;
>> +    const char *s2;
>> +    int first_change;
>> +};
>> +
>> +static struct test_data data[] = {
>> +    { "abc", "abc", 0 },
>> +    { "abc", "def", 0 },
>> +
>> +    { "abc", "abz", 2 },
>> +
>> +    { "abc", "abcdef", 3 },
>> +
>> +    { "abc\xF0zzz", "abc\xFFzzz", 3 },
>> +
>> +    { NULL, NULL, 0 }
>> +};
>> +
>> +int try_pair(const char *sa, const char *sb, int first_change)
>
> This should be static, right?  Found with -Wmissing-prototypes.

right.  thanks.

>
>> +{
>> +    int failed = 0;
>> +    int offset, r_exp, r_tst;
>> +    int r_exp_sign, r_tst_sign;
>> +
>> +    /*
>> +     * Because differnt CRTs behave differently, only rely on signs
>
> s/differnt/different/
>
> René

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2017-04-06 20:42 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-05 17:38 [PATCH v5 0/4] read-cache: speed up add_index_entry git
2017-04-05 17:38 ` [PATCH v5 1/4] p0004-read-tree: perf test to time read-tree git
2017-04-06 20:20   ` René Scharfe
2017-04-06 20:41     ` Jeff Hostetler
2017-04-05 17:38 ` [PATCH v5 2/4] read-cache: add strcmp_offset function git
2017-04-06 14:19   ` SZEDER Gábor
2017-04-06 15:58     ` Jeff Hostetler
2017-04-05 17:38 ` [PATCH v5 3/4] test-strcmp-offset: created test for strcmp_offset git
2017-04-05 22:47   ` SZEDER Gábor
2017-04-06  8:21     ` Johannes Schindelin
2017-04-06 14:25       ` Jeff Hostetler
2017-04-06 20:20   ` René Scharfe
2017-04-06 20:42     ` Jeff Hostetler
2017-04-05 17:38 ` [PATCH v5 4/4] read-cache: speed up add_index_entry during checkout git
2017-04-05 22:54   ` SZEDER Gábor
2017-04-06 14:05     ` Jeff Hostetler

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.