From: Richard Henderson <rth@twiddle.net>
To: qemu-devel@nongnu.org
Cc: pbonzini@redhat.com, vijay.kilari@gmail.com
Subject: [Qemu-devel] [PATCH v3 7/9] cutils: Rewrite x86 buffer zero checking
Date: Mon, 29 Aug 2016 11:46:18 -0700
Message-ID: <1472496380-19706-8-git-send-email-rth@twiddle.net>
In-Reply-To: <1472496380-19706-1-git-send-email-rth@twiddle.net>

Handle buffer alignment within the routines themselves, using
unaligned head and tail loads that overlap the aligned middle, so
that the vector paths can be used more often.  Add versions for AVX1
and SSE4.1, both of which offer incremental improvements over SSE2.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 util/bufferiszero.c | 209 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 179 insertions(+), 30 deletions(-)
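
Note for reviewers, not part of the commit message: the new routines
below all share one alignment strategy, sketched here in plain C.  An
unaligned vector load covers the head, another covers the tail at
buf + len - sizeof(vec), and both overlap an aligned middle loop, so
any buffer with len >= 64 qualifies.  The "vec" type and helper
functions are placeholders for the real vector types and intrinsics,
and the middle loop is deliberately simpler than in the patch.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Placeholder 16-byte vector; the real code uses __m128i/__m256i.  */
typedef struct { uint64_t w[2]; } vec;

/* Unaligned load, expressed portably via memcpy.  */
static vec vec_loadu(const char *p)
{
    vec v;
    memcpy(&v, p, sizeof(v));
    return v;
}

static vec vec_or(vec a, vec b)
{
    a.w[0] |= b.w[0];
    a.w[1] |= b.w[1];
    return a;
}

/* Requires len >= sizeof(vec); the real routines require len >= 64.  */
static bool buffer_zero_sketch(const void *buf, size_t len)
{
    const char *b = buf;

    /* Unaligned head, plus an unaligned tail at the last valid load
       position; together they cover whatever the aligned loop skips.  */
    vec t = vec_or(vec_loadu(b), vec_loadu(b + len - sizeof(vec)));

    /* Aligned middle: round the start up, the end down.  */
    const char *p = (const char *)(((uintptr_t)b + sizeof(vec))
                                   & -(uintptr_t)sizeof(vec));
    const char *e = (const char *)(((uintptr_t)b + len)
                                   & -(uintptr_t)sizeof(vec));

    for (; p < e; p += sizeof(vec)) {
        t = vec_or(t, vec_loadu(p));    /* aligned loads in the patch */
    }
    return (t.w[0] | t.w[1]) == 0;
}

int main(void)
{
    char z[100] = { 0 };
    char nz[100] = { [77] = 1 };
    return buffer_zero_sketch(z, 100)
        && !buffer_zero_sketch(nz, 100) ? 0 : 1;
}

The real routines additionally unroll the middle loop by four,
prefetch ahead, and test the accumulated value inside the loop so
that a nonzero buffer fails fast.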

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 2c5801b..7fcc8e1 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -122,29 +122,177 @@ static bool select_accel_fn(const void *buf, size_t len)
     return buffer_zero_int(buf, len);
 }
 
-#elif defined(CONFIG_AVX2_OPT)
+#elif defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
 #include <cpuid.h>
 #include <x86intrin.h>
 
+/* Note that we're going to check for LEN >= 64 for all of these.  */
+
+#ifdef CONFIG_AVX2_OPT
 #pragma GCC push_options
 #pragma GCC target("avx2")
-#define AVX2_NONZERO(X)  !_mm256_testz_si256((X), (X))
-ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
+
+static bool
+buffer_zero_avx2(const void *buf, size_t len)
+{
+    /* Begin with an unaligned head of 32 bytes.  */
+    __m256i t = _mm256_loadu_si256(buf);
+    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
+    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+
+    if (likely(p <= e)) {
+        /* Loop over 32-byte aligned blocks of 128.  */
+        do {
+            __builtin_prefetch(p);
+            if (unlikely(!_mm256_testz_si256(t, t))) {
+                return false;
+            }
+            t = p[-4] | p[-3] | p[-2] | p[-1];
+            p += 4;
+        } while (p <= e);
+    } else {
+        t |= _mm256_loadu_si256(buf + 32);
+        if (len <= 128) {
+            goto last2;
+        }
+    }
+
+    /* Finish the last block of 128 unaligned.  */
+    t |= _mm256_loadu_si256(buf + len - 4 * 32);
+    t |= _mm256_loadu_si256(buf + len - 3 * 32);
+ last2:
+    t |= _mm256_loadu_si256(buf + len - 2 * 32);
+    t |= _mm256_loadu_si256(buf + len - 1 * 32);
+
+    return _mm256_testz_si256(t, t);
+}
+
+#pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target("avx")
+
+static bool
+buffer_zero_avx(const void *buf, size_t len)
+{
+    __m128i t = _mm_loadu_si128(buf);
+    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+    /* Loop over 16-byte aligned blocks of 64.  */
+    while (likely(p <= e)) {
+        __builtin_prefetch(p);
+        if (unlikely(!_mm_testz_si128(t, t))) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    /* Finish the last block of 64 unaligned.  */
+    t |= _mm_loadu_si128(buf + len - 4 * 16);
+    t |= _mm_loadu_si128(buf + len - 3 * 16);
+    t |= _mm_loadu_si128(buf + len - 2 * 16);
+    t |= _mm_loadu_si128(buf + len - 1 * 16);
+
+    return _mm_testz_si128(t, t);
+}
+
 #pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target("sse4")
+
+static bool
+buffer_zero_sse4(const void *buf, size_t len)
+{
+    __m128i t = _mm_loadu_si128(buf);
+    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+    /* Loop over 16-byte aligned blocks of 64.  */
+    while (likely(p <= e)) {
+        __builtin_prefetch(p);
+        if (unlikely(!_mm_testz_si128(t, t))) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
 
+    /* Finish the aligned tail.  */
+    t |= e[-3];
+    t |= e[-2];
+    t |= e[-1];
+
+    /* Finish the unaligned tail.  */
+    t |= _mm_loadu_si128(buf + len - 16);
+
+    return _mm_testz_si128(t, t);
+}
+
+#pragma GCC pop_options
 #pragma GCC push_options
 #pragma GCC target("sse2")
-#define SSE2_NONZERO(X) \
-    (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
-ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
+#endif /* CONFIG_AVX2_OPT */
+
+static bool
+buffer_zero_sse2(const void *buf, size_t len)
+{
+    __m128i t = _mm_loadu_si128(buf);
+    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+    __m128i zero = _mm_setzero_si128();
+
+    /* Loop over 16-byte aligned blocks of 64.  */
+    while (likely(p <= e)) {
+        __builtin_prefetch(p);
+        t = _mm_cmpeq_epi8(t, zero);
+        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    /* Finish the aligned tail.  */
+    t |= e[-3];
+    t |= e[-2];
+    t |= e[-1];
+
+    /* Finish the unaligned tail.  */
+    t |= _mm_loadu_si128(buf + len - 16);
+
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+}
+
+#ifdef CONFIG_AVX2_OPT
 #pragma GCC pop_options
 
-#define CACHE_AVX2    2
-#define CACHE_AVX1    4
-#define CACHE_SSE4    8
-#define CACHE_SSE2    16
+/* These values must be ordered with the most preferable alternative
+   first.  See test_buffer_is_zero_next_accel.  */
+#define CACHE_AVX2    1
+#define CACHE_AVX1    2
+#define CACHE_SSE4    4
+#define CACHE_SSE2    8
 
 static unsigned cpuid_cache;
+static accel_zero_fn buffer_accel;
+
+static void init_accel(unsigned cache)
+{
+    accel_zero_fn fn;
+    if (cache & CACHE_AVX2) {
+        fn = buffer_zero_avx2;
+    } else if (cache & CACHE_AVX1) {
+        fn = buffer_zero_avx;
+    } else if (cache & CACHE_SSE4) {
+        fn = buffer_zero_sse4;
+    } else if (cache & CACHE_SSE2) {
+        fn = buffer_zero_sse2;
+    } else {
+        fn = buffer_zero_int;
+    }
+    buffer_accel = fn;
+}
 
 static void __attribute__((constructor)) init_cpuid_cache(void)
 {
@@ -163,8 +311,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
 
         /* We must check that AVX is not just available, but usable.  */
         if ((c & bit_OSXSAVE) && (c & bit_AVX)) {
-            __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(0));
-            if ((a & 6) == 6) {
+            int bv;
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            if ((bv & 6) == 6) {
                 cache |= CACHE_AVX1;
                 if (max >= 7) {
                     __cpuid_count(7, 0, a, b, c, d);
@@ -176,34 +325,34 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
         }
     }
     cpuid_cache = cache;
+    init_accel(cache);
 }
 
-static bool select_accel_fn(const void *buf, size_t len)
+#define HAVE_NEXT_ACCEL
+bool test_buffer_is_zero_next_accel(void)
 {
-    uintptr_t ibuf = (uintptr_t)buf;
-    if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
-        return buffer_zero_avx2(buf, len);
-    }
-    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
-        return buffer_zero_sse2(buf, len);
+    /* If no bits are set, we have just tested buffer_zero_int, and
+       there are no more acceleration options to test.  */
+    if (cpuid_cache == 0) {
+        return false;
     }
-    return buffer_zero_int(buf, len);
+    /* Disable the accelerator we used before and select a new one.  */
+    cpuid_cache &= cpuid_cache - 1;
+    init_accel(cpuid_cache);
+    return true;
 }
-
-#elif defined __SSE2__
-#include <emmintrin.h>
-
-#define SSE2_NONZERO(X) \
-    (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
-ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
+#endif /* CONFIG_AVX2_OPT */
 
 static bool select_accel_fn(const void *buf, size_t len)
 {
-    uintptr_t ibuf = (uintptr_t)buf;
-    if (len % 64 == 0 && ibuf % sizeof(__m128i) == 0) {
+    if (likely(len >= 64)) {
+#ifdef CONFIG_AVX2_OPT
+        return buffer_accel(buf, len);
+#else
         return buffer_zero_sse2(buf, len);
+#endif
     }
-    return select_accel_int(buf, len);
+    return buffer_zero_int(buf, len);
 }
 
 #elif defined(__aarch64__)
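
Likewise not part of the patch: a note on the accelerator stepping.
The CACHE_* constants are renumbered so that the most preferable
accelerator occupies the lowest set bit, which lets
test_buffer_is_zero_next_accel step to the next alternative with
cpuid_cache &= cpuid_cache - 1 -- an expression that clears exactly
the lowest set bit.  A hypothetical standalone illustration of the
stepping order:

#include <stdio.h>

#define CACHE_AVX2    1    /* most preferable: lowest bit */
#define CACHE_AVX1    2
#define CACHE_SSE4    4
#define CACHE_SSE2    8    /* least preferable vector path */

int main(void)
{
    unsigned cache = CACHE_AVX2 | CACHE_AVX1 | CACHE_SSE4 | CACHE_SSE2;

    while (cache) {
        /* The lowest set bit decides which function init_accel picks.  */
        printf("init_accel(%#x)\n", cache);
        cache &= cache - 1;    /* retire that accelerator */
    }
    printf("init_accel(0) -> buffer_zero_int\n");
    return 0;
}

Starting from 0xf this visits 0xf, 0xe, 0xc and 0x8, i.e. AVX2, then
AVX1, SSE4.1 and SSE2, before falling back to buffer_zero_int, after
which test_buffer_is_zero_next_accel returns false.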
-- 
2.7.4
