linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] DCACHE_WORD_ACCESS support for ppc64le
@ 2014-09-18 23:40 Anton Blanchard
  2014-09-18 23:40 ` [PATCH 1/3] powerpc: Implement load_unaligned_zeropad Anton Blanchard
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Anton Blanchard @ 2014-09-18 23:40 UTC (permalink / raw)
  To: benh, paulus, mpe, amodra, torvalds; +Cc: linuxppc-dev

This series adds an optimised version of word-at-a-time.h for ppc64le.
It uses the cmpb instruction which compares each byte in two 64 bit
values and for each matching byte places 0xff in the target and 0x00
otherwise.

The intermediate functions diverge a bit from what might have been
intended (e.g. create_zero_mask returns the number of bits up to the NUL
byte), but this makes find_zero and zero_bytemask simpler:

/*
 * Return non-zero iff the word 'a' contains a zero byte; the raw cmpb
 * result is also stored through *bits for the later mask steps.
 *
 * cmpb compares each byte of 'a' with the corresponding byte of 'zero'
 * (all zeroes here) and writes 0xff into the matching result byte on
 * equality, 0x00 otherwise.  The constants argument 'c' is unused in
 * this cmpb-based variant.
 */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
        unsigned long ret;
        unsigned long zero = 0;

        /* ret byte = 0xff where the corresponding byte of 'a' is 0x00 */
        asm("cmpb %0,%1,%2" : "=r" (ret) : "r" (a), "r" (zero));
        *bits = ret;

        return ret;
}

/*
 * Nothing to do: the cmpb output from has_zero() is already in the form
 * create_zero_mask() expects, so simply pass it through.  'a' and 'c'
 * are unused but kept for interface compatibility with the generic code.
 */
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
        return bits;
}

/* Alan Modra's little-endian strlen tail for 64-bit */
/*
 * Convert the cmpb byte mask into the number of bits preceding the
 * first zero byte (least-significant first, i.e. little-endian order).
 *
 * addi/andc compute (bits - 1) & ~bits, which isolates the low-order
 * zero bits below the first set bit; popcntd then counts them.  Note
 * that, unlike the generic interface, this returns a bit count rather
 * than a mask -- find_zero() and zero_bytemask() are written to match.
 */
static inline unsigned long create_zero_mask(unsigned long bits)
{
        unsigned long leading_zero_bits;
        long trailing_zero_bit_mask;

        asm("addi       %1,%2,-1\n\t"
            "andc       %1,%1,%2\n\t"
            "popcntd    %0,%1"
                : "=r" (leading_zero_bits), "=&r" (trailing_zero_bit_mask)
                : "r" (bits));

        return leading_zero_bits;
}

/*
 * Translate the bit count produced by create_zero_mask() into the
 * byte index of the first zero byte.
 */
static inline unsigned long find_zero(unsigned long mask)
{
        return mask / 8;        /* 8 bits per byte */
}

/* This assumes that we never ask for an all 1s bitmask */
/*
 * Build a mask covering every bit below the first zero byte, given the
 * bit count returned by create_zero_mask().
 */
static inline unsigned long zero_bytemask(unsigned long mask)
{
        unsigned long one = 1;

        one <<= mask;
        return one - 1;
}

Anton Blanchard (3):
  powerpc: Implement load_unaligned_zeropad
  powerpc: ppc64le optimised word at a time
  powerpc: Enable DCACHE_WORD_ACCESS on ppc64le

 arch/powerpc/Kconfig                      |   1 +
 arch/powerpc/include/asm/word-at-a-time.h | 101 +++++++++++++++++++++++++-----
 2 files changed, 88 insertions(+), 14 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] powerpc: Implement load_unaligned_zeropad
  2014-09-18 23:40 [PATCH 0/3] DCACHE_WORD_ACCESS support for ppc64le Anton Blanchard
@ 2014-09-18 23:40 ` Anton Blanchard
  2014-09-18 23:40 ` [PATCH 2/3] powerpc: ppc64le optimised word at a time Anton Blanchard
  2014-09-18 23:40 ` [PATCH 3/3] powerpc: Enable DCACHE_WORD_ACCESS on ppc64le Anton Blanchard
  2 siblings, 0 replies; 4+ messages in thread
From: Anton Blanchard @ 2014-09-18 23:40 UTC (permalink / raw)
  To: benh, paulus, mpe, amodra, torvalds; +Cc: linuxppc-dev

Implement a bi-arch and bi-endian version of load_unaligned_zeropad.

Since the fallback case is so rare, a userspace test harness was used
to test this on ppc64le, ppc64 and ppc32:

http://ozlabs.org/~anton/junkcode/test_load_unaligned_zeropad.c

It uses mprotect to force a SEGV across a page boundary, and a SEGV
handler to lookup the exception tables and run the fixup routine.
It also compares the result against a normal load.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/word-at-a-time.h | 40 +++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/word-at-a-time.h b/arch/powerpc/include/asm/word-at-a-time.h
index 9a5c928..07cc121 100644
--- a/arch/powerpc/include/asm/word-at-a-time.h
+++ b/arch/powerpc/include/asm/word-at-a-time.h
@@ -116,4 +116,44 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
 
 #endif
 
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+	unsigned long ret, offset, tmp;
+
+	asm(
+	"1:	" PPC_LL "%[ret], 0(%[addr])\n"
+	"2:\n"
+	".section .fixup,\"ax\"\n"
+	"3:	"
+#ifdef __powerpc64__
+	"clrrdi		%[tmp], %[addr], 3\n\t"
+	"clrlsldi	%[offset], %[addr], 61, 3\n\t"
+	"ld		%[ret], 0(%[tmp])\n\t"
+#ifdef __BIG_ENDIAN__
+	"sld		%[ret], %[ret], %[offset]\n\t"
+#else
+	"srd		%[ret], %[ret], %[offset]\n\t"
+#endif
+#else
+	"clrrwi		%[tmp], %[addr], 2\n\t"
+	"clrlslwi	%[offset], %[addr], 30, 3\n\t"
+	"lwz		%[ret], 0(%[tmp])\n\t"
+#ifdef __BIG_ENDIAN__
+	"slw		%[ret], %[ret], %[offset]\n\t"
+#else
+	"srw		%[ret], %[ret], %[offset]\n\t"
+#endif
+#endif
+	"b	2b\n"
+	".previous\n"
+	".section __ex_table,\"a\"\n\t"
+		PPC_LONG_ALIGN "\n\t"
+		PPC_LONG "1b,3b\n"
+	".previous"
+	: [tmp] "=&b" (tmp), [offset] "=&r" (offset), [ret] "=&r" (ret)
+	: [addr] "b" (addr), "m" (*(unsigned long *)addr));
+
+	return ret;
+}
+
 #endif /* _ASM_WORD_AT_A_TIME_H */
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] powerpc: ppc64le optimised word at a time
  2014-09-18 23:40 [PATCH 0/3] DCACHE_WORD_ACCESS support for ppc64le Anton Blanchard
  2014-09-18 23:40 ` [PATCH 1/3] powerpc: Implement load_unaligned_zeropad Anton Blanchard
@ 2014-09-18 23:40 ` Anton Blanchard
  2014-09-18 23:40 ` [PATCH 3/3] powerpc: Enable DCACHE_WORD_ACCESS on ppc64le Anton Blanchard
  2 siblings, 0 replies; 4+ messages in thread
From: Anton Blanchard @ 2014-09-18 23:40 UTC (permalink / raw)
  To: benh, paulus, mpe, amodra, torvalds; +Cc: linuxppc-dev

Use cmpb which compares each byte in two 64 bit values and
for each matching byte places 0xff in the target and 0x00
otherwise.

A simple hash_name microbenchmark:

http://ozlabs.org/~anton/junkcode/hash_name_bench.c

shows this version to be 10-20% faster than running the x86
version on POWER8, depending on the length.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/word-at-a-time.h | 61 ++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/word-at-a-time.h b/arch/powerpc/include/asm/word-at-a-time.h
index 07cc121..7cff3de 100644
--- a/arch/powerpc/include/asm/word-at-a-time.h
+++ b/arch/powerpc/include/asm/word-at-a-time.h
@@ -42,32 +42,65 @@ static inline bool has_zero(unsigned long val, unsigned long *data, const struct
 
 #else
 
+#ifdef CONFIG_64BIT
+
+/* unused */
 struct word_at_a_time {
-	const unsigned long one_bits, high_bits;
 };
 
-#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
+#define WORD_AT_A_TIME_CONSTANTS { }
 
-#ifdef CONFIG_64BIT
+/* This will give us 0xff for a NULL char and 0x00 elsewhere */
+static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
+{
+	unsigned long ret;
+	unsigned long zero = 0;
 
-/* Alan Modra's little-endian strlen tail for 64-bit */
-#define create_zero_mask(mask) (mask)
+	asm("cmpb %0,%1,%2" : "=r" (ret) : "r" (a), "r" (zero));
+	*bits = ret;
 
-static inline unsigned long find_zero(unsigned long mask)
+	return ret;
+}
+
+static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
+{
+	return bits;
+}
+
+/* Alan Modra's little-endian strlen tail for 64-bit */
+static inline unsigned long create_zero_mask(unsigned long bits)
 {
 	unsigned long leading_zero_bits;
 	long trailing_zero_bit_mask;
 
-	asm ("addi %1,%2,-1\n\t"
-	     "andc %1,%1,%2\n\t"
-	     "popcntd %0,%1"
-	     : "=r" (leading_zero_bits), "=&r" (trailing_zero_bit_mask)
-	     : "r" (mask));
-	return leading_zero_bits >> 3;
+	asm("addi	%1,%2,-1\n\t"
+	    "andc	%1,%1,%2\n\t"
+	    "popcntd	%0,%1"
+		: "=r" (leading_zero_bits), "=&r" (trailing_zero_bit_mask)
+		: "r" (bits));
+
+	return leading_zero_bits;
+}
+
+static inline unsigned long find_zero(unsigned long mask)
+{
+	return mask >> 3;
+}
+
+/* This assumes that we never ask for an all 1s bitmask */
+static inline unsigned long zero_bytemask(unsigned long mask)
+{
+	return (1UL << mask) - 1;
 }
 
 #else	/* 32-bit case */
 
+struct word_at_a_time {
+	const unsigned long one_bits, high_bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
+
 /*
  * This is largely generic for little-endian machines, but the
  * optimal byte mask counting is probably going to be something
@@ -96,8 +129,6 @@ static inline unsigned long find_zero(unsigned long mask)
 	return count_masked_bytes(mask);
 }
 
-#endif
-
 /* Return nonzero if it has a zero */
 static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
 {
@@ -116,6 +147,8 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
 
 #endif
 
+#endif
+
 static inline unsigned long load_unaligned_zeropad(const void *addr)
 {
 	unsigned long ret, offset, tmp;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] powerpc: Enable DCACHE_WORD_ACCESS on ppc64le
  2014-09-18 23:40 [PATCH 0/3] DCACHE_WORD_ACCESS support for ppc64le Anton Blanchard
  2014-09-18 23:40 ` [PATCH 1/3] powerpc: Implement load_unaligned_zeropad Anton Blanchard
  2014-09-18 23:40 ` [PATCH 2/3] powerpc: ppc64le optimised word at a time Anton Blanchard
@ 2014-09-18 23:40 ` Anton Blanchard
  2 siblings, 0 replies; 4+ messages in thread
From: Anton Blanchard @ 2014-09-18 23:40 UTC (permalink / raw)
  To: benh, paulus, mpe, amodra, torvalds; +Cc: linuxppc-dev

Enable DCACHE_WORD_ACCESS on ppc64le. It should work on
ppc64 and ppc32 but we need to do some testing first.

A somewhat reasonable testcase used to show the performance
improvement - a repeated stat of a 33 byte filename that
doesn't exist:

 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>

 #define ITERATIONS 10000000

 #define PATH "123456781234567812345678123456781"

 int main(void)
 {
 	unsigned long i;
 	struct stat buf;

 	for (i = 0; i < ITERATIONS; i++)
 		stat(PATH, &buf);

 	return 0;
 }

runs 27% faster on POWER8.

Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 90fe77a..7992f35 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -148,6 +148,7 @@ config PPC
 	select HAVE_ARCH_AUDITSYSCALL
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select HAVE_PERF_EVENTS_NMI if PPC64
+	select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2014-09-18 23:40 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-18 23:40 [PATCH 0/3] DCACHE_WORD_ACCESS support for ppc64le Anton Blanchard
2014-09-18 23:40 ` [PATCH 1/3] powerpc: Implement load_unaligned_zeropad Anton Blanchard
2014-09-18 23:40 ` [PATCH 2/3] powerpc: ppc64le optimised word at a time Anton Blanchard
2014-09-18 23:40 ` [PATCH 3/3] powerpc: Enable DCACHE_WORD_ACCESS on ppc64le Anton Blanchard

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).