All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
@ 2014-06-05 23:56 David Turner
  2014-06-05 23:56 ` [PATCH v7 1/1] " David Turner
  2014-06-09 22:16 ` [PATCH v7 0/1] " Junio C Hamano
  0 siblings, 2 replies; 12+ messages in thread
From: David Turner @ 2014-06-05 23:56 UTC (permalink / raw)
  To: git, gitster, mhagger

Since Junio has picked up the first patch from previous versions of
this series, I'm just going to send the second (SSE) one.  I decided
not to s/NO_SSE42/!HAVE_SSE42/ because it looks like git mostly uses
the former convention (for instance, that's what GIT_PARSE_WITH
generates).

Thanks for all of the comments so far.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v7 1/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-05 23:56 [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component David Turner
@ 2014-06-05 23:56 ` David Turner
  2014-06-14 15:22   ` Ondřej Bílka
  2014-06-09 22:16 ` [PATCH v7 0/1] " Junio C Hamano
  1 sibling, 1 reply; 12+ messages in thread
From: David Turner @ 2014-06-05 23:56 UTC (permalink / raw)
  To: git, gitster, mhagger; +Cc: David Turner

Optimize check_refname_component using SSE4.2, where available.

git rev-parse HEAD is a good test-case for this, since it does almost
nothing except parse refs.  For one particular repo with about 60k
refs, almost all packed, the timings are:

Look up table: 29 ms
SSE4.2:        25 ms

This is about a 15% improvement.

The configure.ac changes include code from the GNU C Library written
by Joseph S. Myers <joseph at codesourcery dot com>.

Only supports GCC and Clang at present, because C interfaces to the
cpuid instruction are not well-standardized.

Signed-off-by: David Turner <dturner@twitter.com>
---
 Makefile           |   6 +++
 aclocal.m4         |   6 +++
 compat/cpuid.h     |  25 +++++++++
 configure.ac       |  17 ++++++
 git-compat-util.h  |  23 ++++++++
 refs.c             | 150 +++++++++++++++++++++++++++++++++++++++++++++++------
 t/t5511-refspec.sh |  13 +++++
 7 files changed, 225 insertions(+), 15 deletions(-)
 create mode 100644 compat/cpuid.h

diff --git a/Makefile b/Makefile
index a53f3a8..dd2127a 100644
--- a/Makefile
+++ b/Makefile
@@ -1326,6 +1326,11 @@ else
 		COMPAT_OBJS += compat/win32mmap.o
 	endif
 endif
+ifdef NO_SSE42
+	BASIC_CFLAGS += -DNO_SSE42
+else
+	BASIC_CFLAGS += -msse4.2
+endif
 ifdef OBJECT_CREATION_USES_RENAMES
 	COMPAT_CFLAGS += -DOBJECT_CREATION_MODE=1
 endif
@@ -2199,6 +2204,7 @@ GIT-BUILD-OPTIONS: FORCE
 	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
 	@echo NO_PYTHON=\''$(subst ','\'',$(subst ','\'',$(NO_PYTHON)))'\' >>$@
 	@echo NO_UNIX_SOCKETS=\''$(subst ','\'',$(subst ','\'',$(NO_UNIX_SOCKETS)))'\' >>$@
+	@echo NO_SSE42=\''$(subst ','\'',$(subst ','\'',$(NO_SSE42)))'\' >>$@
 ifdef TEST_OUTPUT_DIRECTORY
 	@echo TEST_OUTPUT_DIRECTORY=\''$(subst ','\'',$(subst ','\'',$(TEST_OUTPUT_DIRECTORY)))'\' >>$@
 endif
diff --git a/aclocal.m4 b/aclocal.m4
index f11bc7e..d9f3f19 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -38,3 +38,9 @@ AC_DEFUN([TYPE_SOCKLEN_T],
       [#include <sys/types.h>
 #include <sys/socket.h>])
 ])
+
+dnl Test a compiler option or options with an empty input file.
+dnl LIBC_TRY_CC_OPTION([options], [action-if-true], [action-if-false])
+AC_DEFUN([LIBC_TRY_CC_OPTION],
+[AS_IF([AC_TRY_COMMAND([${CC-cc} $1 -xc /dev/null -S -o /dev/null])],
+	[$2], [$3])])
diff --git a/compat/cpuid.h b/compat/cpuid.h
new file mode 100644
index 0000000..7709e35
--- /dev/null
+++ b/compat/cpuid.h
@@ -0,0 +1,25 @@
+#ifndef COMPAT_CPUID_H
+#define COMPAT_CPUID_H
+
+#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
+#include <cpuid.h>
+#define CPUID_SSE42_BIT (1 << 20)
+
+static int processor_supports_sse42(void)
+{
+	unsigned eax, ebx, ecx, edx;
+	__cpuid(1, eax, ebx, ecx, edx);
+	return ecx & CPUID_SSE42_BIT;
+}
+
+#else
+
+static int processor_supports_sse42(void)
+{
+	/* TODO: handle CPUID on non-GCC compilers */
+	return 0;
+}
+
+#endif
+
+#endif /* COMPAT_CPUID_H */
diff --git a/configure.ac b/configure.ac
index b711254..3a5bda9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -382,6 +382,23 @@ AS_HELP_STRING([],[Tcl/Tk interpreter will be found in a system.]),
 GIT_PARSE_WITH(tcltk))
 #
 
+# Declare the with-sse42/without-sse42 options.
+AC_ARG_WITH(sse42,
+AS_HELP_STRING([--with-sse42],[use SSE4.2 instructions])
+AS_HELP_STRING([],[(default is YES if your compiler supports -msse4.2)]),
+GIT_PARSE_WITH(sse42))
+
+if test "$NO_SSE42" != "YesPlease"; then
+   dnl Check if -msse4.2 works.
+   AC_CACHE_CHECK(for SSE4.2 support, cc_cv_sse42, [dnl
+   LIBC_TRY_CC_OPTION([-msse4.2], [cc_cv_sse42=yes], [cc_cv_sse42=no])
+   ])
+   if test $cc_cv_sse42 = no; then
+     NO_SSE42=1
+   fi
+fi
+
+GIT_CONF_SUBST([NO_SSE42])
 
 ## Checks for programs.
 AC_MSG_NOTICE([CHECKS for programs])
diff --git a/git-compat-util.h b/git-compat-util.h
index f6d3a46..2c66d6d 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -668,6 +668,29 @@ void git_qsort(void *base, size_t nmemb, size_t size,
 #endif
 #endif
 
+#ifndef NO_SSE42
+#include <nmmintrin.h>
+#include "compat/cpuid.h"
+/*
+ * Clang ships with a version of nmmintrin.h that's incomplete; if
+ * necessary, we define the constants that we're going to use.
+ */
+#ifndef _SIDD_UBYTE_OPS
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#endif
+
+/* This is the system memory page size; it's used so that we can read
+ * outside the bounds of an allocation without segfaulting.
+ */
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+#endif
+
 #ifdef UNRELIABLE_FSTAT
 #define fstat_is_reliable() 0
 #else
diff --git a/refs.c b/refs.c
index 46139d2..b3bd333 100644
--- a/refs.c
+++ b/refs.c
@@ -24,6 +24,25 @@ static unsigned char refname_disposition[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 4, 4
 };
 
+static int check_refname_component_trailer(const char *cp, const char *refname, int flags)
+{
+	if (cp == refname)
+		return 0; /* Component has zero length. */
+	if (refname[0] == '.') {
+		if (!(flags & REFNAME_DOT_COMPONENT))
+			return -1; /* Component starts with '.'. */
+		/*
+		 * Even if leading dots are allowed, don't allow "."
+		 * as a component (".." is prevented by a rule above).
+		 */
+		if (refname[1] == '\0')
+			return -1; /* Component equals ".". */
+	}
+	if (cp - refname >= 5 && !memcmp(cp - 5, ".lock", 5))
+		return -1; /* Refname ends with ".lock". */
+	return cp - refname;
+}
+
 /*
  * Try to read one refname component from the front of refname.
  * Return the length of the component found, or -1 if the component is
@@ -37,7 +56,7 @@ static unsigned char refname_disposition[256] = {
  * - it ends with ".lock"
  * - it contains a "\" (backslash)
  */
-static int check_refname_component(const char *refname, int flags)
+static int check_refname_component_bytewise(const char *refname, int flags)
 {
 	const char *cp;
 	char last = '\0';
@@ -47,7 +66,7 @@ static int check_refname_component(const char *refname, int flags)
 		unsigned char disp = refname_disposition[ch];
 		switch(disp) {
 		case 1:
-			goto out;
+			return check_refname_component_trailer(cp, refname, flags);
 		case 2:
 			if (last == '.')
 				return -1; /* Refname contains "..". */
@@ -61,24 +80,125 @@ static int check_refname_component(const char *refname, int flags)
 		}
 		last = ch;
 	}
-out:
-	if (cp == refname)
-		return 0; /* Component has zero length. */
-	if (refname[0] == '.') {
-		if (!(flags & REFNAME_DOT_COMPONENT))
-			return -1; /* Component starts with '.'. */
+}
+
+#if defined(NO_SSE42) || !(defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
+#define check_refname_component check_refname_component_bytewise
+#else
+#define SSE_VECTOR_BYTES 16
+
+/* Vectorized version of check_refname_component */
+static int check_refname_component_sse42(const char *refname, int flags)
+{
+	const __m128i *refname_vec = (__m128i*) refname;
+
+	/* Character ranges for characters forbidden in refs; see above */
+	static const __v16qi bad = {
+		0x01, 0x20,  0x7e, 0x7f,  0x5e, 0x5e,  0x3a, 0x3a,
+		0x5b, 0x5c,  0x2a, 0x2a,  0x3f, 0x3f,  0x3f, 0x3f};
+
+	static const __v16qi nonslashes = {
+		'\001', '/' -1, '/' + 1, 0xff,
+	};
+
+	static const __v16qi dotdot = {'.', '.', 0};
+	static const __v16qi atcurly = {'@', '{', 0};
+
+	const __m128i *vp;
+	const char *cp = (const char *)refname_vec;
+
+	int dotdotpos = SSE_VECTOR_BYTES, atcurlypos = SSE_VECTOR_BYTES;
+	for (vp = refname_vec; ; vp++) {
+		__m128i tmp;
+		int endpos;
+
 		/*
-		 * Even if leading dots are allowed, don't allow "."
-		 * as a component (".." is prevented by a rule above).
+		 * Handle case of forbidden substrings .. and @{ crossing
+		 * sixteen-byte boundaries
 		 */
-		if (refname[1] == '\0')
-			return -1; /* Component equals ".". */
+		if (dotdotpos == 15 && *cp == '.')
+			return -1;
+
+		if (atcurlypos == 15 && *cp == '{')
+			return -1;
+
+		if (((uintptr_t) vp % PAGE_SIZE) > PAGE_SIZE - SSE_VECTOR_BYTES)
+			/*
+			 * End-of-page; fall back to slow method for
+			 * this entire component.
+			 */
+			return check_refname_component_bytewise(refname, flags);
+
+		tmp = _mm_lddqu_si128(vp);
+
+		/*
+		 * Find slashes or end-of-string. The double-negative
+		 * (negative-polarity search for non-slashes) is
+		 * necessary so that \0 will also be counted.
+		 */
+		endpos = _mm_cmpistri((__m128i) nonslashes, tmp,
+				      _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES |
+				      _SIDD_NEGATIVE_POLARITY);
+
+		if (_mm_cmpestrc((__m128i) bad, SSE_VECTOR_BYTES, tmp, endpos,
+				 _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES))
+			return -1;
+
+		dotdotpos = _mm_cmpestri((__m128i) dotdot, 2, tmp, endpos,
+					 _SIDD_UBYTE_OPS |
+					 _SIDD_CMP_EQUAL_ORDERED);
+		if (dotdotpos < 15)
+			return -1;
+
+		atcurlypos = _mm_cmpestri((__m128i) atcurly, 2, tmp, endpos,
+					  _SIDD_UBYTE_OPS |
+					  _SIDD_CMP_EQUAL_ORDERED);
+		if (atcurlypos < 15)
+			return -1;
+
+		if (endpos < SSE_VECTOR_BYTES) {
+			cp = ((const char*) vp) + endpos;
+			break;
+		}
+		cp = (const char*) vp + SSE_VECTOR_BYTES;
 	}
-	if (cp - refname >= 5 && !memcmp(cp - 5, ".lock", 5))
-		return -1; /* Refname ends with ".lock". */
-	return cp - refname;
+	return check_refname_component_trailer(cp, refname, flags);
+}
+
+static void* find_check_refname_component(void)
+{
+	if (processor_supports_sse42())
+		return check_refname_component_sse42;
+	else
+		return check_refname_component_bytewise;
 }
 
+#ifdef __clang__
+
+static int check_refname_component_autodetect(const char *refname, int flags);
+
+static int (*check_refname_component)(const char *, int) = check_refname_component_autodetect;
+
+/*
+ * On the first run of check_refname_component, autodetect the correct
+ * version for this CPU.  This is necessary because Clang doesn't support
+ * ifunc.
+ */
+static int check_refname_component_autodetect(const char *refname, int flags)
+{
+	check_refname_component = find_check_refname_component();
+	return check_refname_component(refname, flags);
+}
+
+#else
+
+static int check_refname_component(const char *refname, int flags)
+	__attribute__ ((ifunc ("find_check_refname_component")));
+
+#endif /* __clang__ */
+
+#endif /* NO_SSE42 */
+
 int check_refname_format(const char *refname, int flags)
 {
 	int component_len, component_count = 0;
diff --git a/t/t5511-refspec.sh b/t/t5511-refspec.sh
index de6db86..7f1bd74 100755
--- a/t/t5511-refspec.sh
+++ b/t/t5511-refspec.sh
@@ -88,4 +88,17 @@ test_refspec fetch "refs/heads/${good}"
 bad=$(printf '\011tab')
 test_refspec fetch "refs/heads/${bad}"				invalid
 
+test_refspec fetch 'refs/heads/a-very-long-refname'
+test_refspec fetch 'refs/heads/.a-very-long-refname'		invalid
+test_refspec fetch 'refs/heads/abcdefgh0123..'			invalid
+test_refspec fetch 'refs/heads/abcdefgh01234..'			invalid
+test_refspec fetch 'refs/heads/abcdefgh012345..'		invalid
+test_refspec fetch 'refs/heads/abcdefgh0123456..'		invalid
+test_refspec fetch 'refs/heads/abcdefgh01234567..'		invalid
+test_refspec fetch 'refs/heads/abcdefgh0123.a'
+test_refspec fetch 'refs/heads/abcdefgh01234.a'
+test_refspec fetch 'refs/heads/abcdefgh012345.a'
+test_refspec fetch 'refs/heads/abcdefgh0123456.a'
+test_refspec fetch 'refs/heads/abcdefgh01234567.a'
+
 test_done
-- 
2.0.0.rc1.24.g0588c94.dirty

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-05 23:56 [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component David Turner
  2014-06-05 23:56 ` [PATCH v7 1/1] " David Turner
@ 2014-06-09 22:16 ` Junio C Hamano
  2014-06-09 22:39   ` David Turner
  2014-06-09 23:05   ` Junio C Hamano
  1 sibling, 2 replies; 12+ messages in thread
From: Junio C Hamano @ 2014-06-09 22:16 UTC (permalink / raw)
  To: David Turner; +Cc: git, mhagger

David Turner <dturner@twopensource.com> writes:

> Since Junio has picked up the first patch from previous versions of
> this series, I'm just going to send the second (SSE) one.  I decided
> not to s/NO_SSE42/!HAVE_SSE42/ because it looks like git mostly uses
> the former convention (for instance, that's what GIT_PARSE_WITH
> generates).

Yeah but NO_FROTZ is used only when FROTZ is something everybody is
expected to have (e.g. it's in posix, people ought to have it, but
we do support those who don't), isn't it?  For a very arch specific
stuff like sse42, I'd feel better to make it purely opt-in by
forcing people to explicitly say HAVE_SSE42 to enable it.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-09 22:16 ` [PATCH v7 0/1] " Junio C Hamano
@ 2014-06-09 22:39   ` David Turner
  2014-06-09 23:05   ` Junio C Hamano
  1 sibling, 0 replies; 12+ messages in thread
From: David Turner @ 2014-06-09 22:39 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, mhagger

On Mon, 2014-06-09 at 15:16 -0700, Junio C Hamano wrote:
> David Turner <dturner@twopensource.com> writes:
> 
> > Since Junio has picked up the first patch from previous versions of
> > this series, I'm just going to send the second (SSE) one.  I decided
> > not to s/NO_SSE42/!HAVE_SSE42/ because it looks like git mostly uses
> > the former convention (for instance, that's what GIT_PARSE_WITH
> > generates).
> 
> Yeah but NO_FROTZ is used only when FROTZ is something everybody is
> expected to have (e.g. it's in posix, people ought to have it, but
> we do support those who don't), isn't it?  For a very arch specific
> stuff like sse42, I'd feel better to make it purely opt-in by
> forcing people to explicitly say HAVE_SSE42 to enable it.

The patch now has two kinds of autodetection:

1. At build-time, we check for the compiler supporting -msse4.2.  If it
does, and if the user has not explicitly done --without-sse, then we
build with SSE support.  This does not mean that the SSE code will
necessarily be used because:
2. At run-time, if we have built with SSE support, we check cpuid to
choose a version of the function that will run on the current CPU.

So I think we never hit a case where we try to use SSE and fail, which
is the major reason I see to make it non-default.

To me, this means that we should not require people to explicitly
request SSE, because we generally want to try to provide the
most-efficient version of git that will work everywhere.  In fact, I am
not sure we need a --without-sse option at all, since all it saves is a
cpuid instruction.  But I don't need to remove the option, in case
there's a use for it I'm not thinking of.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-09 22:16 ` [PATCH v7 0/1] " Junio C Hamano
  2014-06-09 22:39   ` David Turner
@ 2014-06-09 23:05   ` Junio C Hamano
  2014-06-10  6:04     ` Johannes Sixt
  1 sibling, 1 reply; 12+ messages in thread
From: Junio C Hamano @ 2014-06-09 23:05 UTC (permalink / raw)
  To: David Turner; +Cc: git, mhagger

Junio C Hamano <gitster@pobox.com> writes:

> David Turner <dturner@twopensource.com> writes:
>
>> Since Junio has picked up the first patch from previous versions of
>> this series, I'm just going to send the second (SSE) one.  I decided
>> not to s/NO_SSE42/!HAVE_SSE42/ because it looks like git mostly uses
>> the former convention (for instance, that's what GIT_PARSE_WITH
>> generates).
>
> Yeah but NO_FROTZ is used only when FROTZ is something everybody is
> expected to have (e.g. it's in posix, people ought to have it, but
> we do support those who don't), isn't it?  For a very arch specific
> stuff like sse42, I'd feel better to make it purely opt-in by
> forcing people to explicitly say HAVE_SSE42 to enable it.

Just FYI: I am getting

compat/cpuid.h:8:12: error: 'processor_supports_sse42' defined but
not used [-Werror=unused-function]
cc1: all warnings being treated as errors

while building 'pu'; I'll have to rebuild 'pu' without this patch
before I can push the day's result out.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-09 23:05   ` Junio C Hamano
@ 2014-06-10  6:04     ` Johannes Sixt
  2014-06-10  6:55       ` Junio C Hamano
  2014-06-13  1:18       ` David Turner
  0 siblings, 2 replies; 12+ messages in thread
From: Johannes Sixt @ 2014-06-10  6:04 UTC (permalink / raw)
  To: Junio C Hamano, David Turner; +Cc: git, mhagger

Am 6/10/2014 1:05, schrieb Junio C Hamano:
> Junio C Hamano <gitster@pobox.com> writes:
> 
>> David Turner <dturner@twopensource.com> writes:
>>
>>> Since Junio has picked up the first patch from previous versions of
>>> this series, I'm just going to send the second (SSE) one.  I decided
>>> not to s/NO_SSE42/!HAVE_SSE42/ because it looks like git mostly uses
>>> the former convention (for instance, that's what GIT_PARSE_WITH
>>> generates).
>>
>> Yeah but NO_FROTZ is used only when FROTZ is something everybody is
>> expected to have (e.g. it's in posix, people ought to have it, but
>> we do support those who don't), isn't it?  For a very arch specific
>> stuff like sse42, I'd feel better to make it purely opt-in by
>> forcing people to explicitly say HAVE_SSE42 to enable it.
> 
> Just FYI: I am getting
> 
> compat/cpuid.h:8:12: error: 'processor_supports_sse42' defined but
> not used [-Werror=unused-function]
> cc1: all warnings being treated as errors
> 
> while building 'pu'; I'll have to rebuild 'pu' without this patch
> before I can push the day's result out.

And I get this when I compile on Windows with msysgit:

    CC abspath.o
In file included from git-compat-util.h:694,
                 from cache.h:4,
                 from abspath.c:1:
compat/cpuid.h: In function 'processor_supports_sse42':
compat/cpuid.h:11: warning: implicit declaration of function '__cpuid'
abspath.c: At top level:
compat/cpuid.h:8: warning: 'processor_supports_sse42' defined but not used
abspath.c: In function 'processor_supports_sse42':
compat/cpuid.h:11: warning: 'eax' is used uninitialized in this function
compat/cpuid.h:11: warning: 'ebx' is used uninitialized in this function
compat/cpuid.h:11: warning: 'ecx' is used uninitialized in this function
compat/cpuid.h:11: warning: 'edx' is used uninitialized in this function

Perhaps our gcc is too old?

-- Hannes

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-10  6:04     ` Johannes Sixt
@ 2014-06-10  6:55       ` Junio C Hamano
  2014-06-13  1:18       ` David Turner
  1 sibling, 0 replies; 12+ messages in thread
From: Junio C Hamano @ 2014-06-10  6:55 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: David Turner, git, mhagger

Johannes Sixt <j.sixt@viscovery.net> writes:

> And I get this when I compile on Windows with msysgit:
>
>     CC abspath.o
> In file included from git-compat-util.h:694,
>                  from cache.h:4,
>                  from abspath.c:1:
> compat/cpuid.h: In function 'processor_supports_sse42':
> compat/cpuid.h:11: warning: implicit declaration of function '__cpuid'
> abspath.c: At top level:
> compat/cpuid.h:8: warning: 'processor_supports_sse42' defined but not used
> abspath.c: In function 'processor_supports_sse42':
> compat/cpuid.h:11: warning: 'eax' is used uninitialized in this function
> compat/cpuid.h:11: warning: 'ebx' is used uninitialized in this function
> compat/cpuid.h:11: warning: 'ecx' is used uninitialized in this function
> compat/cpuid.h:11: warning: 'edx' is used uninitialized in this function
>
> Perhaps our gcc is too old?

Maybe.

In any case, it is a good indication that it probably is a good idea
to start with an optional USE_SSE42 (not !NO_SSE42 or HAVE_SSE42) so
that it is clear to anybody that those with SSE42 does not have to
use this compilation option.  Once the code proves itself, we can
consider turning it on by default when able, but it seems that it is
a bit too premature for that (not that the code itself is premature
in the original author's environment, but its portability has not
been quite ready for everybody yet, it seems).

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-10  6:04     ` Johannes Sixt
  2014-06-10  6:55       ` Junio C Hamano
@ 2014-06-13  1:18       ` David Turner
  2014-06-13  4:11         ` Torsten Bögershausen
  1 sibling, 1 reply; 12+ messages in thread
From: David Turner @ 2014-06-13  1:18 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: Junio C Hamano, git, mhagger

On Tue, 2014-06-10 at 08:04 +0200, Johannes Sixt wrote:
> Am 6/10/2014 1:05, schrieb Junio C Hamano:
> > Junio C Hamano <gitster@pobox.com> writes:
> > 
> >> David Turner <dturner@twopensource.com> writes:
> >>
> >>> Since Junio has picked up the first patch from previous versions of
> >>> this series, I'm just going to send the second (SSE) one.  I decided
> >>> not to s/NO_SSE42/!HAVE_SSE42/ because it looks like git mostly uses
> >>> the former convention (for instance, that's what GIT_PARSE_WITH
> >>> generates).
> >>
> >> Yeah but NO_FROTZ is used only when FROTZ is something everybody is
> >> expected to have (e.g. it's in posix, people ought to have it, but
> >> we do support those who don't), isn't it?  For a very arch specific
> >> stuff like sse42, I'd feel better to make it purely opt-in by
> >> forcing people to explicitly say HAVE_SSE42 to enable it.
> > 
> > Just FYI: I am getting
> > 
> > compat/cpuid.h:8:12: error: 'processor_supports_sse42' defined but
> > not used [-Werror=unused-function]
> > cc1: all warnings being treated as errors
> > 
> > while building 'pu'; I'll have to rebuild 'pu' without this patch
> > before I can push the day's result out.
> 
> And I get this when I compile on Windows with msysgit:
> 
>     CC abspath.o
> In file included from git-compat-util.h:694,
>                  from cache.h:4,
>                  from abspath.c:1:
> compat/cpuid.h: In function 'processor_supports_sse42':
> compat/cpuid.h:11: warning: implicit declaration of function '__cpuid'
> abspath.c: At top level:
> compat/cpuid.h:8: warning: 'processor_supports_sse42' defined but not used
> abspath.c: In function 'processor_supports_sse42':
> compat/cpuid.h:11: warning: 'eax' is used uninitialized in this function
> compat/cpuid.h:11: warning: 'ebx' is used uninitialized in this function
> compat/cpuid.h:11: warning: 'ecx' is used uninitialized in this function
> compat/cpuid.h:11: warning: 'edx' is used uninitialized in this function
> 
> Perhaps our gcc is too old?

It is too old for my patch because it doesn't support ifunc (and I
suspect that no version of GCC for Windows supports ifunc).  But that
does not seem to be what is going on in your error message.  Instead,
when we #include <cpuid.h>, we get compat/cpuid.h rather than the
system's cpuid.h.  When I rename compat/cpuid.h to something else (and
adjust the code accordingly), and add a check for gcc 4.5+ and for
Windows before using ifunc, then everything works.

So tomorrow, I'm going to send a new patch (which also fixes the
warnings Junio reported) which I've tested on Windows, GNU/Linux, and
Mac OS X. 

I'm testing on a Windows 8 VM from modern.ie with msysgit's
"netinstaller" -- is that a reasonable test environment?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-13  1:18       ` David Turner
@ 2014-06-13  4:11         ` Torsten Bögershausen
  2014-06-14 10:24           ` Philip Oakley
  0 siblings, 1 reply; 12+ messages in thread
From: Torsten Bögershausen @ 2014-06-13  4:11 UTC (permalink / raw)
  To: David Turner, Johannes Sixt; +Cc: Junio C Hamano, git, mhagger

On 2014-06-13 03.18, David Turner wrote:
[]
> 
> It is too old for my patch because it doesn't support ifunc (and I
> suspect that no version of GCC for Windows supports ifunc).  But that
> does not seem to be what is going on in your error message.  Instead,
> when we #include <cpuid.h>, we get compat/cpuid.h rather than the
> system's cpuid.h. When I rename compat/cpuid.h to something else 
compat/git_cpuid.h ?

> I'm testing on a Windows 8 VM from modern.ie with msysgit's
> "netinstaller" -- is that a reasonable test environment?

Many people are using Windows 7,
and we shouldn't break for things for Windows XP.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-13  4:11         ` Torsten Bögershausen
@ 2014-06-14 10:24           ` Philip Oakley
  0 siblings, 0 replies; 12+ messages in thread
From: Philip Oakley @ 2014-06-14 10:24 UTC (permalink / raw)
  To: Torsten Bögershausen, David Turner, Johannes Sixt
  Cc: Junio C Hamano, Git List, mhagger

From: "Torsten Bögershausen" <tboegi@web.de>
> On 2014-06-13 03.18, David Turner wrote:
> []
>>
>> It is too old for my patch because it doesn't support ifunc (and I
>> suspect that no version of GCC for Windows supports ifunc).  But that
>> does not seem to be what is going on in your error message.  Instead,
>> when we #include <cpuid.h>, we get compat/cpuid.h rather than the
>> system's cpuid.h. When I rename compat/cpuid.h to something else
> compat/git_cpuid.h ?
>
>> I'm testing on a Windows 8 VM from modern.ie with msysgit's
>> "netinstaller" -- is that a reasonable test environment?
>
> Many people are using Windows 7,
> and we shouldn't break for things for Windows XP.
>
I'd like that.
--
Philip
sent from my XP netbook;-)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-05 23:56 ` [PATCH v7 1/1] " David Turner
@ 2014-06-14 15:22   ` Ondřej Bílka
  2014-06-15  5:53     ` David Turner
  0 siblings, 1 reply; 12+ messages in thread
From: Ondřej Bílka @ 2014-06-14 15:22 UTC (permalink / raw)
  To: David Turner; +Cc: git, gitster, mhagger, David Turner

On Thu, Jun 05, 2014 at 07:56:15PM -0400, David Turner wrote:
> Optimize check_refname_component using SSE4.2, where available.
> 
> git rev-parse HEAD is a good test-case for this, since it does almost
> nothing except parse refs.  For one particular repo with about 60k
> refs, almost all packed, the timings are:
> 
> Look up table: 29 ms
> SSE4.2:        25 ms
> 
> This is about a 15% improvement.
> 
> The configure.ac changes include code from the GNU C Library written
> by Joseph S. Myers <joseph at codesourcery dot com>.
> 
> Only supports GCC and Clang at present, because C interfaces to the
> cpuid instruction are not well-standardized.
>
Still a SSE4.2 is not that useful, in most cases SSE2 is faster. Here I
think that difference will not be that big when correctly implemented.
That will avoid a runtime checks.

For parallelisation you need to take extra step and paralelize whole
check than going component-by-component.

For detecting sequences a faster way is construct bitmasks with SSE2 so
you could combine these. It avoids needing special casing on 16-byte
boundaries. 

Below is untested implementation where you could add a bad character
check with SSE4.2 which would speed it up. Are refs mostly
alphanumerical? If so we could speed this up by paralelized alnum check
and handling other characters in slower path.


#include <stdint.h>
#include <emmintrin.h>

char bad_table[256]; // TODO
int bad_characters(unsigned char *x)
{
	while (*x)
	if (bad_table[*x++])
		return -1;

	return 0;
}

int check_refname_skeleton(char *x)
{
	if (bad_characters(x))
		return -1;

	__m128i slash  = _mm_set1_epi8 ('/');
	__m128i dot    = _mm_set1_epi8 ('.');
	__m128i char_l = _mm_set1_epi8 ('l');
	__m128i at     = _mm_set1_epi8 ('@');
	__m128i brace  = _mm_set1_epi8 ('{');

	while (1) {
		if (((uint64_t) x) & 4095 < 4096 - 17)
			{
				if (bytewise_check(x) != -2);
					return bytewise_check(x);
	
				x += 16;
			}

		__m128i v0 = _mm_loadu_si128 ((__m128i *)(x));
		__m128i v1 = _mm_loadu_si128 ((__m128i *)(x + 1));

		__m128i result;

		// terminating 0
		result = _mm_cmpeq_epi8(v0, _mm_set1_epi8('\000'));

		// sequence ..
		result = _mm_or_si128(result, _mm_and_si128 (_mm_cmpeq_epi8(v0, dot),
                		              		     _mm_cmpeq_epi8(v1, dot)));

		// sequence /.
		result = _mm_or_si128(result, _mm_and_si128 (_mm_cmpeq_epi8(v0, slash),
                		              		     _mm_cmpeq_epi8(v1, dot)));

		// sequence //
		result = _mm_or_si128(result, _mm_and_si128 (_mm_cmpeq_epi8(v0, slash),
                		              		     _mm_cmpeq_epi8(v1, slash)));

                                                                 
		// sequence .l                                                   
		result = _mm_or_si128(result, _mm_and_si128 (_mm_cmpeq_epi8(v0, dot),
                		              		     _mm_cmpeq_epi8(v1, char_l)));
                                                                 
		// sequence @{                                                   
                                                                 
		result = _mm_or_si128(result, _mm_and_si128 (_mm_cmpeq_epi8(v0, at),
                		                             _mm_cmpeq_epi8(v1, brace)));

		uint64_t mask = _mm_movemask_epi8(result);
		if (mask) {
		        char *p = x + __builtin_ctzl(mask);

		        if (!*p)
                		return 0;
		        else if (p[0] == '.' && p[1] == 'l')
                		if (bytewise_check(x) != -2)
					return bytewise_check(x);
		        else
                		return -1;
		}
		x += 16;
	}
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v7 1/1] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-14 15:22   ` Ondřej Bílka
@ 2014-06-15  5:53     ` David Turner
  0 siblings, 0 replies; 12+ messages in thread
From: David Turner @ 2014-06-15  5:53 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: git, gitster, mhagger, David Turner

On Sat, 2014-06-14 at 17:22 +0200, Ondřej Bílka wrote:
> On Thu, Jun 05, 2014 at 07:56:15PM -0400, David Turner wrote:
> > Optimize check_refname_component using SSE4.2, where available.
> > 
> > git rev-parse HEAD is a good test-case for this, since it does almost
> > nothing except parse refs.  For one particular repo with about 60k
> > refs, almost all packed, the timings are:
> > 
> > Look up table: 29 ms
> > SSE4.2:        25 ms
> > 
> > This is about a 15% improvement.
> > 
> > The configure.ac changes include code from the GNU C Library written
> > by Joseph S. Myers <joseph at codesourcery dot com>.
> > 
> > Only supports GCC and Clang at present, because C interfaces to the
> > cpuid instruction are not well-standardized.
> >
> Still a SSE4.2 is not that useful, in most cases SSE2 is faster. Here I
> think that difference will not be that big when correctly implemented.
> That will avoid a runtime checks.

Surprisingly to me, this is true!  At least, on my machine.  Sadly, the
only way to make it avoid a runtime check is to exclude 32-bit machines
(or to make the option non-default, which I would prefer not to do). 

> For parallelisation you need to take extra step and paralelize whole
> check than going component-by-component.

Good idea.

> For detecting sequences a faster way is construct bitmasks with SSE2 so
> you could combine these. It avoids needing special casing on 16-byte
> boundaries.

That does seem to be faster.

> Below is untested implementation where you could add a bad character
> check with SSE4.2 which would speed it up. Are refs mostly
> alphanumerical? If so we could speed this up by paralelized alnum check
> and handling other characters in slower path.

Twitter's are almost entirely in [-._/a-zA-Z0-9] -- there are only a
handful of exceptions. So, a method that has some bycatch outside of
this range is just as fast as the SSE4.2 bad character check (but
somewhat more code).

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-06-15  5:54 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-05 23:56 [PATCH v7 0/1] refs.c: SSE4.2 optimizations for check_refname_component David Turner
2014-06-05 23:56 ` [PATCH v7 1/1] " David Turner
2014-06-14 15:22   ` Ondřej Bílka
2014-06-15  5:53     ` David Turner
2014-06-09 22:16 ` [PATCH v7 0/1] " Junio C Hamano
2014-06-09 22:39   ` David Turner
2014-06-09 23:05   ` Junio C Hamano
2014-06-10  6:04     ` Johannes Sixt
2014-06-10  6:55       ` Junio C Hamano
2014-06-13  1:18       ` David Turner
2014-06-13  4:11         ` Torsten Bögershausen
2014-06-14 10:24           ` Philip Oakley

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.