All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero
@ 2016-09-13 16:09 Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 01/10] cutils: Move buffer_is_zero and subroutines to a new file Paolo Bonzini
                   ` (9 more replies)
  0 siblings, 10 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

Some reorganization of Richard's patches that helped me find the
problem with the AVX version.

The new patch is an adaptation of the SSE4 version using ptest,
before the final rewrite that includes unaligned buffer support.

Paolo

Paolo Bonzini (1):
  cutils: Add SSE4 version

Richard Henderson (9):
  cutils: Move buffer_is_zero and subroutines to a new file
  cutils: Remove SPLAT macro
  cutils: Export only buffer_is_zero
  cutils: Rearrange buffer_is_zero acceleration
  cutils: Remove aarch64 buffer zero checking
  cutils: Remove ppc buffer zero checking
  cutils: Add test for buffer_is_zero
  cutils: Add generic prefetch
  cutils: Rewrite x86 buffer zero checking

 configure                 |  21 +---
 include/qemu/cutils.h     |   3 +-
 migration/ram.c           |   2 +-
 migration/rdma.c          |   5 +-
 tests/Makefile.include    |   3 +
 tests/test-bufferiszero.c |  78 +++++++++++++
 util/Makefile.objs        |   1 +
 util/bufferiszero.c       | 274 ++++++++++++++++++++++++++++++++++++++++++++++
 util/cutils.c             | 244 -----------------------------------------
 9 files changed, 365 insertions(+), 266 deletions(-)
 create mode 100644 tests/test-bufferiszero.c
 create mode 100644 util/bufferiszero.c

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 01/10] cutils: Move buffer_is_zero and subroutines to a new file
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
@ 2016-09-13 16:09 ` Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 02/10] cutils: Remove SPLAT macro Paolo Bonzini
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1472496380-19706-2-git-send-email-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/Makefile.objs  |   1 +
 util/bufferiszero.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 util/cutils.c       | 244 ----------------------------------------------
 3 files changed, 273 insertions(+), 244 deletions(-)
 create mode 100644 util/bufferiszero.c

diff --git a/util/Makefile.objs b/util/Makefile.objs
index 96cb1e0..ffca8f3 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-y += bufferiszero.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
new file mode 100644
index 0000000..9bb1ae5
--- /dev/null
+++ b/util/bufferiszero.c
@@ -0,0 +1,272 @@
+/*
+ * Simple C functions to supplement the C library
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+
+
+/* vector definitions */
+#ifdef __ALTIVEC__
+#include <altivec.h>
+/* The altivec.h header says we're allowed to undef these for
+ * C++ compatibility.  Here we don't care about C++, but we
+ * undef them anyway to avoid namespace pollution.
+ */
+#undef vector
+#undef pixel
+#undef bool
+#define VECTYPE        __vector unsigned char
+#define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
+#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
+#define VEC_OR(v1, v2) ((v1) | (v2))
+/* altivec.h may redefine the bool macro as vector type.
+ * Reset it to POSIX semantics. */
+#define bool _Bool
+#elif defined __SSE2__
+#include <emmintrin.h>
+#define VECTYPE        __m128i
+#define SPLAT(p)       _mm_set1_epi8(*(p))
+#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
+#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
+#elif defined(__aarch64__)
+#include "arm_neon.h"
+#define VECTYPE        uint64x2_t
+#define ALL_EQ(v1, v2) \
+        ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
+         (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
+#define VEC_OR(v1, v2) ((v1) | (v2))
+#else
+#define VECTYPE        unsigned long
+#define SPLAT(p)       (*(p) * (~0UL / 255))
+#define ALL_EQ(v1, v2) ((v1) == (v2))
+#define VEC_OR(v1, v2) ((v1) | (v2))
+#endif
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
+
+static bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
+/*
+ * Searches for an area with non-zero content in a buffer
+ *
+ * Attention! The len must be a multiple of
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
+ * and addr must be a multiple of sizeof(VECTYPE) due to
+ * restriction of optimizations in this function.
+ *
+ * can_use_buffer_find_nonzero_offset_inner() can be used to
+ * check these requirements.
+ *
+ * The return value is the offset of the non-zero area rounded
+ * down to a multiple of sizeof(VECTYPE) for the first
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
+ * afterwards.
+ *
+ * If the buffer is all zero the return value is equal to len.
+ */
+
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    const VECTYPE *p = buf;
+    const VECTYPE zero = (VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!ALL_EQ(p[i], zero)) {
+            return i * sizeof(VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
+        VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
+        VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
+        VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
+        VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
+        VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
+        if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(VECTYPE);
+}
+
+#if defined CONFIG_AVX2_OPT
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <cpuid.h>
+#include <immintrin.h>
+
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+static bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+static bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+
+    return b & bit_AVX2;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
+size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+
+static void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+#pragma GCC pop_options
+#else
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+#endif
+
+/*
+ * Checks if a buffer is all zeroes
+ *
+ * Attention! The len must be a multiple of 4 * sizeof(long) due to
+ * restriction of optimizations in this function.
+ */
+bool buffer_is_zero(const void *buf, size_t len)
+{
+    /*
+     * Use long as the biggest available internal data type that fits into the
+     * CPU register and unroll the loop to smooth out the effect of memory
+     * latency.
+     */
+
+    size_t i;
+    long d0, d1, d2, d3;
+    const long * const data = buf;
+
+    /* use vector optimized zero check if possible */
+    if (can_use_buffer_find_nonzero_offset(buf, len)) {
+        return buffer_find_nonzero_offset(buf, len) == len;
+    }
+
+    assert(len % (4 * sizeof(long)) == 0);
+    len /= sizeof(long);
+
+    for (i = 0; i < len; i += 4) {
+        d0 = data[i + 0];
+        d1 = data[i + 1];
+        d2 = data[i + 2];
+        d3 = data[i + 3];
+
+        if (d0 || d1 || d2 || d3) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
diff --git a/util/cutils.c b/util/cutils.c
index 7505fda..4fefcf3 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -161,250 +161,6 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
-/* vector definitions */
-#ifdef __ALTIVEC__
-#include <altivec.h>
-/* The altivec.h header says we're allowed to undef these for
- * C++ compatibility.  Here we don't care about C++, but we
- * undef them anyway to avoid namespace pollution.
- */
-#undef vector
-#undef pixel
-#undef bool
-#define VECTYPE        __vector unsigned char
-#define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
-#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
-#define VEC_OR(v1, v2) ((v1) | (v2))
-/* altivec.h may redefine the bool macro as vector type.
- * Reset it to POSIX semantics. */
-#define bool _Bool
-#elif defined __SSE2__
-#include <emmintrin.h>
-#define VECTYPE        __m128i
-#define SPLAT(p)       _mm_set1_epi8(*(p))
-#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
-#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
-#elif defined(__aarch64__)
-#include "arm_neon.h"
-#define VECTYPE        uint64x2_t
-#define ALL_EQ(v1, v2) \
-        ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
-         (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
-#define VEC_OR(v1, v2) ((v1) | (v2))
-#else
-#define VECTYPE        unsigned long
-#define SPLAT(p)       (*(p) * (~0UL / 255))
-#define ALL_EQ(v1, v2) ((v1) == (v2))
-#define VEC_OR(v1, v2) ((v1) | (v2))
-#endif
-
-#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-
-static bool
-can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
-
-/*
- * Searches for an area with non-zero content in a buffer
- *
- * Attention! The len must be a multiple of
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
- * and addr must be a multiple of sizeof(VECTYPE) due to
- * restriction of optimizations in this function.
- *
- * can_use_buffer_find_nonzero_offset_inner() can be used to
- * check these requirements.
- *
- * The return value is the offset of the non-zero area rounded
- * down to a multiple of sizeof(VECTYPE) for the first
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
- * afterwards.
- *
- * If the buffer is all zero the return value is equal to len.
- */
-
-static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
-{
-    const VECTYPE *p = buf;
-    const VECTYPE zero = (VECTYPE){0};
-    size_t i;
-
-    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
-
-    if (!len) {
-        return 0;
-    }
-
-    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
-        if (!ALL_EQ(p[i], zero)) {
-            return i * sizeof(VECTYPE);
-        }
-    }
-
-    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
-         i < len / sizeof(VECTYPE);
-         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
-        VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
-        VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
-        VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
-        VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
-        VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
-        VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
-        if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
-            break;
-        }
-    }
-
-    return i * sizeof(VECTYPE);
-}
-
-#if defined CONFIG_AVX2_OPT
-#pragma GCC push_options
-#pragma GCC target("avx2")
-#include <cpuid.h>
-#include <immintrin.h>
-
-#define AVX2_VECTYPE        __m256i
-#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
-#define AVX2_ALL_EQ(v1, v2) \
-    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
-#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
-
-static bool
-can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(AVX2_VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
-}
-
-static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
-{
-    const AVX2_VECTYPE *p = buf;
-    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
-    size_t i;
-
-    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
-
-    if (!len) {
-        return 0;
-    }
-
-    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
-        if (!AVX2_ALL_EQ(p[i], zero)) {
-            return i * sizeof(AVX2_VECTYPE);
-        }
-    }
-
-    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
-         i < len / sizeof(AVX2_VECTYPE);
-         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
-        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
-        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
-        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
-        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
-        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
-        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
-        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
-            break;
-        }
-    }
-
-    return i * sizeof(AVX2_VECTYPE);
-}
-
-static bool avx2_support(void)
-{
-    int a, b, c, d;
-
-    if (__get_cpuid_max(0, NULL) < 7) {
-        return false;
-    }
-
-    __cpuid_count(7, 0, a, b, c, d);
-
-    return b & bit_AVX2;
-}
-
-bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
-         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
-size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
-         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
-
-static void *buffer_find_nonzero_offset_ifunc(void)
-{
-    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
-        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
-
-    return func;
-}
-
-static void *can_use_buffer_find_nonzero_offset_ifunc(void)
-{
-    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
-        can_use_buffer_find_nonzero_offset_avx2 :
-        can_use_buffer_find_nonzero_offset_inner;
-
-    return func;
-}
-#pragma GCC pop_options
-#else
-bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return can_use_buffer_find_nonzero_offset_inner(buf, len);
-}
-
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return buffer_find_nonzero_offset_inner(buf, len);
-}
-#endif
-
-/*
- * Checks if a buffer is all zeroes
- *
- * Attention! The len must be a multiple of 4 * sizeof(long) due to
- * restriction of optimizations in this function.
- */
-bool buffer_is_zero(const void *buf, size_t len)
-{
-    /*
-     * Use long as the biggest available internal data type that fits into the
-     * CPU register and unroll the loop to smooth out the effect of memory
-     * latency.
-     */
-
-    size_t i;
-    long d0, d1, d2, d3;
-    const long * const data = buf;
-
-    /* use vector optimized zero check if possible */
-    if (can_use_buffer_find_nonzero_offset(buf, len)) {
-        return buffer_find_nonzero_offset(buf, len) == len;
-    }
-
-    assert(len % (4 * sizeof(long)) == 0);
-    len /= sizeof(long);
-
-    for (i = 0; i < len; i += 4) {
-        d0 = data[i + 0];
-        d1 = data[i + 1];
-        d2 = data[i + 2];
-        d3 = data[i + 3];
-
-        if (d0 || d1 || d2 || d3) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
 #ifndef _WIN32
 /* Sets a specific flag */
 int fcntl_setfl(int fd, int flag)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 02/10] cutils: Remove SPLAT macro
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 01/10] cutils: Move buffer_is_zero and subroutines to a new file Paolo Bonzini
@ 2016-09-13 16:09 ` Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 03/10] cutils: Export only buffer_is_zero Paolo Bonzini
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

This is unused and complicates the vector interface.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1472496380-19706-3-git-send-email-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/bufferiszero.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 9bb1ae5..067d08f 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -37,7 +37,6 @@
 #undef pixel
 #undef bool
 #define VECTYPE        __vector unsigned char
-#define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
 #define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
 #define VEC_OR(v1, v2) ((v1) | (v2))
 /* altivec.h may redefine the bool macro as vector type.
@@ -46,7 +45,6 @@
 #elif defined __SSE2__
 #include <emmintrin.h>
 #define VECTYPE        __m128i
-#define SPLAT(p)       _mm_set1_epi8(*(p))
 #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
 #define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
 #elif defined(__aarch64__)
@@ -58,7 +56,6 @@
 #define VEC_OR(v1, v2) ((v1) | (v2))
 #else
 #define VECTYPE        unsigned long
-#define SPLAT(p)       (*(p) * (~0UL / 255))
 #define ALL_EQ(v1, v2) ((v1) == (v2))
 #define VEC_OR(v1, v2) ((v1) | (v2))
 #endif
@@ -135,7 +132,6 @@ static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 #include <immintrin.h>
 
 #define AVX2_VECTYPE        __m256i
-#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
 #define AVX2_ALL_EQ(v1, v2) \
     (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
 #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 03/10] cutils: Export only buffer_is_zero
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 01/10] cutils: Move buffer_is_zero and subroutines to a new file Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 02/10] cutils: Remove SPLAT macro Paolo Bonzini
@ 2016-09-13 16:09 ` Paolo Bonzini
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 04/10] cutils: Rearrange buffer_is_zero acceleration Paolo Bonzini
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

Since the two users don't make use of the returned offset,
beyond ensuring that the entire buffer is zero, consider the
can_use_buffer_find_nonzero_offset and buffer_find_nonzero_offset
functions internal.

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1472496380-19706-4-git-send-email-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/cutils.h | 2 --
 migration/ram.c       | 2 +-
 migration/rdma.c      | 5 +----
 util/bufferiszero.c   | 8 ++++----
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 3e4ea23..ca58577 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -168,8 +168,6 @@ int64_t qemu_strtosz_suffix_unit(const char *nptr, char **end,
 /* used to print char* safely */
 #define STR_OR_NULL(str) ((str) ? (str) : "null")
 
-bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
-size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 bool buffer_is_zero(const void *buf, size_t len);
 
 /*
diff --git a/migration/ram.c b/migration/ram.c
index a3d70c4..a6e1c63 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -73,7 +73,7 @@ static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
 
 static inline bool is_zero_range(uint8_t *p, uint64_t size)
 {
-    return buffer_find_nonzero_offset(p, size) == size;
+    return buffer_is_zero(p, size);
 }
 
 /* struct contains XBZRLE cache and a static page
diff --git a/migration/rdma.c b/migration/rdma.c
index 5110ec8..88bdb64 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1934,10 +1934,7 @@ retry:
              * memset() + madvise() the entire chunk without RDMA.
              */
 
-            if (can_use_buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
-                                                   length)
-                   && buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
-                                                    length) == length) {
+            if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
                 RDMACompress comp = {
                                         .offset = current_addr,
                                         .value = 0,
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 067d08f..0cf8b6e 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -192,9 +192,9 @@ static bool avx2_support(void)
     return b & bit_AVX2;
 }
 
-bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+static bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
          __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
-size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+static size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
          __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
 
 static void *buffer_find_nonzero_offset_ifunc(void)
@@ -215,12 +215,12 @@ static void *can_use_buffer_find_nonzero_offset_ifunc(void)
 }
 #pragma GCC pop_options
 #else
-bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+static bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
 {
     return can_use_buffer_find_nonzero_offset_inner(buf, len);
 }
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset(const void *buf, size_t len)
 {
     return buffer_find_nonzero_offset_inner(buf, len);
 }
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 04/10] cutils: Rearrange buffer_is_zero acceleration
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (2 preceding siblings ...)
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 03/10] cutils: Export only buffer_is_zero Paolo Bonzini
@ 2016-09-13 16:09 ` Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 05/10] cutils: Remove aarch64 buffer zero checking Paolo Bonzini
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

Allow selection of several acceleration functions
based on the size and alignment of the buffer.
Do not require ifunc support for AVX2 acceleration.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1472496380-19706-5-git-send-email-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 configure           |  21 +--
 util/bufferiszero.c | 375 +++++++++++++++++++++++-----------------------------
 2 files changed, 172 insertions(+), 224 deletions(-)

diff --git a/configure b/configure
index 331c36f..7e09b79 100755
--- a/configure
+++ b/configure
@@ -1794,28 +1794,19 @@ fi
 ##########################################
 # avx2 optimization requirement check
 
-
-if test "$static" = "no" ; then
-  cat > $TMPC << EOF
+cat > $TMPC << EOF
 #pragma GCC push_options
 #pragma GCC target("avx2")
 #include <cpuid.h>
 #include <immintrin.h>
-
 static int bar(void *a) {
-    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(*(__m256i *)a, (__m256i){0}));
+    __m256i x = *(__m256i *)a;
+    return _mm256_testz_si256(x, x);
 }
-static void *bar_ifunc(void) {return (void*) bar;}
-int foo(void *a) __attribute__((ifunc("bar_ifunc")));
-int main(int argc, char *argv[]) { return foo(argv[0]);}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
 EOF
-  if compile_object "" ; then
-      if has readelf; then
-          if readelf --syms $TMPO 2>/dev/null |grep -q "IFUNC.*foo"; then
-              avx2_opt="yes"
-          fi
-      fi
-  fi
+if compile_object "" ; then
+  avx2_opt="yes"
 fi
 
 #########################################
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 0cf8b6e..0bcca70 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -24,245 +24,202 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/cutils.h"
+#include "qemu/bswap.h"
 
 
 /* vector definitions */
-#ifdef __ALTIVEC__
+
+extern void link_error(void);
+
+#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO)         \
+static bool NAME(const void *buf, size_t len)                   \
+{                                                               \
+    const void *end = buf + len;                                \
+    do {                                                        \
+        const VECTYPE *p = buf;                                 \
+        VECTYPE t;                                              \
+        if (SIZE == sizeof(VECTYPE) * 4) {                      \
+            t = (p[0] | p[1]) | (p[2] | p[3]);                  \
+        } else if (SIZE == sizeof(VECTYPE) * 8) {               \
+            t  = p[0] | p[1];                                   \
+            t |= p[2] | p[3];                                   \
+            t |= p[4] | p[5];                                   \
+            t |= p[6] | p[7];                                   \
+        } else {                                                \
+            link_error();                                       \
+        }                                                       \
+        if (unlikely(NONZERO(t))) {                             \
+            return false;                                       \
+        }                                                       \
+        buf += SIZE;                                            \
+    } while (buf < end);                                        \
+    return true;                                                \
+}
+
+static bool
+buffer_zero_int(const void *buf, size_t len)
+{
+    if (unlikely(len < 8)) {
+        /* For a very small buffer, simply accumulate all the bytes.  */
+        const unsigned char *p = buf;
+        const unsigned char *e = buf + len;
+        unsigned char t = 0;
+
+        do {
+            t |= *p++;
+        } while (p < e);
+
+        return t == 0;
+    } else {
+        /* Otherwise, use the unaligned memory access functions to
+           handle the beginning and end of the buffer, with a couple
+           of loops handling the middle aligned section.  */
+        uint64_t t = ldq_he_p(buf);
+        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
+        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
+
+        for (; p + 8 <= e; p += 8) {
+            __builtin_prefetch(p + 8);
+            if (t) {
+                return false;
+            }
+            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
+        }
+        while (p < e) {
+            t |= *p++;
+        }
+        t |= ldq_he_p(buf + len - 8);
+
+        return t == 0;
+    }
+}
+
+#if defined(__ALTIVEC__)
 #include <altivec.h>
 /* The altivec.h header says we're allowed to undef these for
  * C++ compatibility.  Here we don't care about C++, but we
  * undef them anyway to avoid namespace pollution.
+ * altivec.h may redefine the bool macro as vector type.
+ * Reset it to POSIX semantics.
  */
 #undef vector
 #undef pixel
 #undef bool
-#define VECTYPE        __vector unsigned char
-#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
-#define VEC_OR(v1, v2) ((v1) | (v2))
-/* altivec.h may redefine the bool macro as vector type.
- * Reset it to POSIX semantics. */
 #define bool _Bool
-#elif defined __SSE2__
+#define DO_NONZERO(X)  vec_any_ne(X, (__vector unsigned char){ 0 })
+ACCEL_BUFFER_ZERO(buffer_zero_ppc, 128, __vector unsigned char, DO_NONZERO)
+
+static bool select_accel_fn(const void *buf, size_t len)
+{
+    uintptr_t ibuf = (uintptr_t)buf;
+    if (len % 128 == 0 && ibuf % sizeof(__vector unsigned char) == 0) {
+        return buffer_zero_ppc(buf, len);
+    }
+    return buffer_zero_int(buf, len);
+}
+
+#elif defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && defined(__SSE2__))
+#include <cpuid.h>
+
+#pragma GCC push_options
+#pragma GCC target("sse2")
 #include <emmintrin.h>
-#define VECTYPE        __m128i
-#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
-#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
-#elif defined(__aarch64__)
-#include "arm_neon.h"
-#define VECTYPE        uint64x2_t
-#define ALL_EQ(v1, v2) \
-        ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
-         (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
-#define VEC_OR(v1, v2) ((v1) | (v2))
-#else
-#define VECTYPE        unsigned long
-#define ALL_EQ(v1, v2) ((v1) == (v2))
-#define VEC_OR(v1, v2) ((v1) | (v2))
-#endif
+#define SSE2_NONZERO(X) \
+    (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
+ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
+#pragma GCC pop_options
 
-#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-
-static bool
-can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
-
-/*
- * Searches for an area with non-zero content in a buffer
- *
- * Attention! The len must be a multiple of
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
- * and addr must be a multiple of sizeof(VECTYPE) due to
- * restriction of optimizations in this function.
- *
- * can_use_buffer_find_nonzero_offset_inner() can be used to
- * check these requirements.
- *
- * The return value is the offset of the non-zero area rounded
- * down to a multiple of sizeof(VECTYPE) for the first
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
- * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
- * afterwards.
- *
- * If the buffer is all zero the return value is equal to len.
- */
-
-static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
-{
-    const VECTYPE *p = buf;
-    const VECTYPE zero = (VECTYPE){0};
-    size_t i;
-
-    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
-
-    if (!len) {
-        return 0;
-    }
-
-    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
-        if (!ALL_EQ(p[i], zero)) {
-            return i * sizeof(VECTYPE);
-        }
-    }
-
-    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
-         i < len / sizeof(VECTYPE);
-         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
-        VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
-        VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
-        VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
-        VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
-        VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
-        VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
-        if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
-            break;
-        }
-    }
-
-    return i * sizeof(VECTYPE);
-}
-
-#if defined CONFIG_AVX2_OPT
+#ifdef CONFIG_AVX2_OPT
 #pragma GCC push_options
 #pragma GCC target("avx2")
-#include <cpuid.h>
 #include <immintrin.h>
-
-#define AVX2_VECTYPE        __m256i
-#define AVX2_ALL_EQ(v1, v2) \
-    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
-#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
-
-static bool
-can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(AVX2_VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
-}
-
-static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
-{
-    const AVX2_VECTYPE *p = buf;
-    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
-    size_t i;
-
-    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
-
-    if (!len) {
-        return 0;
-    }
-
-    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
-        if (!AVX2_ALL_EQ(p[i], zero)) {
-            return i * sizeof(AVX2_VECTYPE);
-        }
-    }
-
-    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
-         i < len / sizeof(AVX2_VECTYPE);
-         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
-        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
-        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
-        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
-        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
-        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
-        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
-        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
-            break;
-        }
-    }
-
-    return i * sizeof(AVX2_VECTYPE);
-}
-
-static bool avx2_support(void)
-{
-    int a, b, c, d;
-
-    if (__get_cpuid_max(0, NULL) < 7) {
-        return false;
-    }
-
-    __cpuid_count(7, 0, a, b, c, d);
-
-    return b & bit_AVX2;
-}
-
-static bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
-         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
-static size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
-         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
-
-static void *buffer_find_nonzero_offset_ifunc(void)
-{
-    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
-        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
-
-    return func;
-}
-
-static void *can_use_buffer_find_nonzero_offset_ifunc(void)
-{
-    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
-        can_use_buffer_find_nonzero_offset_avx2 :
-        can_use_buffer_find_nonzero_offset_inner;
-
-    return func;
-}
+#define AVX2_NONZERO(X)  !_mm256_testz_si256((X), (X))
+ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
 #pragma GCC pop_options
-#else
-static bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+#endif
+
+#define CACHE_AVX2    2
+#define CACHE_AVX1    4
+#define CACHE_SSE4    8
+#define CACHE_SSE2    16
+
+static unsigned cpuid_cache;
+
+static void __attribute__((constructor)) init_cpuid_cache(void)
 {
-    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+    int max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    unsigned cache = 0;
+
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+        if (d & bit_SSE2) {
+            cache |= CACHE_SSE2;
+        }
+        if (c & bit_SSE4_1) {
+            cache |= CACHE_SSE4;
+        }
+
+        /* We must check that AVX is not just available, but usable.  */
+        if ((c & bit_OSXSAVE) && (c & bit_AVX)) {
+            __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(0));
+            if ((a & 6) == 6) {
+                cache |= CACHE_AVX1;
+                if (max >= 7) {
+                    __cpuid_count(7, 0, a, b, c, d);
+                    if (b & bit_AVX2) {
+                        cache |= CACHE_AVX2;
+                    }
+                }
+            }
+        }
+    }
+    cpuid_cache = cache;
 }
 
-static size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static bool select_accel_fn(const void *buf, size_t len)
 {
-    return buffer_find_nonzero_offset_inner(buf, len);
+    uintptr_t ibuf = (uintptr_t)buf;
+#ifdef CONFIG_AVX2_OPT
+    if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
+        return buffer_zero_avx2(buf, len);
+    }
+#endif
+    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
+        return buffer_zero_sse2(buf, len);
+    }
+    return buffer_zero_int(buf, len);
 }
+
+#elif defined(__aarch64__)
+#include "arm_neon.h"
+
+#define DO_NONZERO(X)  (vgetq_lane_u64((X), 0) | vgetq_lane_u64((X), 1))
+ACCEL_BUFFER_ZERO(buffer_zero_neon, 128, uint64x2_t, DO_NONZERO)
+
+static bool select_accel_fn(const void *buf, size_t len)
+{
+    uintptr_t ibuf = (uintptr_t)buf;
+    if (len % 128 == 0 && ibuf % sizeof(uint64x2_t) == 0) {
+        return buffer_zero_neon(buf, len);
+    }
+    return buffer_zero_int(buf, len);
+}
+
+#else
+#define select_accel_fn  buffer_zero_int
 #endif
 
 /*
  * Checks if a buffer is all zeroes
- *
- * Attention! The len must be a multiple of 4 * sizeof(long) due to
- * restriction of optimizations in this function.
  */
 bool buffer_is_zero(const void *buf, size_t len)
 {
-    /*
-     * Use long as the biggest available internal data type that fits into the
-     * CPU register and unroll the loop to smooth out the effect of memory
-     * latency.
-     */
-
-    size_t i;
-    long d0, d1, d2, d3;
-    const long * const data = buf;
-
-    /* use vector optimized zero check if possible */
-    if (can_use_buffer_find_nonzero_offset(buf, len)) {
-        return buffer_find_nonzero_offset(buf, len) == len;
+    if (unlikely(len == 0)) {
+        return true;
     }
 
-    assert(len % (4 * sizeof(long)) == 0);
-    len /= sizeof(long);
-
-    for (i = 0; i < len; i += 4) {
-        d0 = data[i + 0];
-        d1 = data[i + 1];
-        d2 = data[i + 2];
-        d3 = data[i + 3];
-
-        if (d0 || d1 || d2 || d3) {
-            return false;
-        }
-    }
-
-    return true;
+    /* Use an optimized zero check if possible.  Note that this also
+       includes a check for an unrolled loop over 64-bit integers.  */
+    return select_accel_fn(buf, len);
 }
-
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 05/10] cutils: Remove aarch64 buffer zero checking
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (3 preceding siblings ...)
  2016-09-13 16:09 ` [Qemu-devel] [PATCH 04/10] cutils: Rearrange buffer_is_zero acceleration Paolo Bonzini
@ 2016-09-13 16:10 ` Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 06/10] cutils: Remove ppc " Paolo Bonzini
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

The revised integer version is 4 times faster than the neon version
on an AppliedMicro Mustang.  Even with hand scheduling and additional
unrolling I cannot make any neon version run as fast as the integer.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/bufferiszero.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 0bcca70..107b0e9 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -191,21 +191,6 @@ static bool select_accel_fn(const void *buf, size_t len)
     return buffer_zero_int(buf, len);
 }
 
-#elif defined(__aarch64__)
-#include "arm_neon.h"
-
-#define DO_NONZERO(X)  (vgetq_lane_u64((X), 0) | vgetq_lane_u64((X), 1))
-ACCEL_BUFFER_ZERO(buffer_zero_neon, 128, uint64x2_t, DO_NONZERO)
-
-static bool select_accel_fn(const void *buf, size_t len)
-{
-    uintptr_t ibuf = (uintptr_t)buf;
-    if (len % 128 == 0 && ibuf % sizeof(uint64x2_t) == 0) {
-        return buffer_zero_neon(buf, len);
-    }
-    return buffer_zero_int(buf, len);
-}
-
 #else
 #define select_accel_fn  buffer_zero_int
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 06/10] cutils: Remove ppc buffer zero checking
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (4 preceding siblings ...)
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 05/10] cutils: Remove aarch64 buffer zero checking Paolo Bonzini
@ 2016-09-13 16:10 ` Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 07/10] cutils: Add test for buffer_is_zero Paolo Bonzini
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

For ppc64le, gcc6 does extremely poorly with the Altivec code.
Moreover, on POWER7 and POWER8, a hand-optimized Altivec version
turns out to be no faster than the revised integer version, and
therefore not worth the effort.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/bufferiszero.c | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 107b0e9..de35408 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -94,31 +94,7 @@ buffer_zero_int(const void *buf, size_t len)
     }
 }
 
-#if defined(__ALTIVEC__)
-#include <altivec.h>
-/* The altivec.h header says we're allowed to undef these for
- * C++ compatibility.  Here we don't care about C++, but we
- * undef them anyway to avoid namespace pollution.
- * altivec.h may redefine the bool macro as vector type.
- * Reset it to POSIX semantics.
- */
-#undef vector
-#undef pixel
-#undef bool
-#define bool _Bool
-#define DO_NONZERO(X)  vec_any_ne(X, (__vector unsigned char){ 0 })
-ACCEL_BUFFER_ZERO(buffer_zero_ppc, 128, __vector unsigned char, DO_NONZERO)
-
-static bool select_accel_fn(const void *buf, size_t len)
-{
-    uintptr_t ibuf = (uintptr_t)buf;
-    if (len % 128 == 0 && ibuf % sizeof(__vector unsigned char) == 0) {
-        return buffer_zero_ppc(buf, len);
-    }
-    return buffer_zero_int(buf, len);
-}
-
-#elif defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && defined(__SSE2__))
+#if defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && defined(__SSE2__))
 #include <cpuid.h>
 
 #pragma GCC push_options
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 07/10] cutils: Add test for buffer_is_zero
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (5 preceding siblings ...)
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 06/10] cutils: Remove ppc " Paolo Bonzini
@ 2016-09-13 16:10 ` Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 08/10] cutils: Add SSE4 version Paolo Bonzini
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1472496380-19706-6-git-send-email-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/cutils.h     |  1 +
 tests/Makefile.include    |  3 ++
 tests/test-bufferiszero.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 util/bufferiszero.c       | 20 ++++++++++++
 4 files changed, 102 insertions(+)
 create mode 100644 tests/test-bufferiszero.c

diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index ca58577..8033929 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -169,6 +169,7 @@ int64_t qemu_strtosz_suffix_unit(const char *nptr, char **end,
 #define STR_OR_NULL(str) ((str) ? (str) : "null")
 
 bool buffer_is_zero(const void *buf, size_t len);
+bool test_buffer_is_zero_next_accel(void);
 
 /*
  * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
diff --git a/tests/Makefile.include b/tests/Makefile.include
index e3a3266..bde274d 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -112,6 +112,8 @@ check-unit-y += tests/test-crypto-xts$(EXESUF)
 check-unit-y += tests/test-crypto-block$(EXESUF)
 gcov-files-test-logging-y = tests/test-logging.c
 check-unit-y += tests/test-logging$(EXESUF)
+check-unit-y += tests/test-bufferiszero$(EXESUF)
+gcov-files-check-bufferiszero-y = util/bufferiszero.c
 
 check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
 
@@ -484,6 +486,7 @@ tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y)
 tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y)
 tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(test-util-obj-y)
 tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
+tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
 
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
 	hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
diff --git a/tests/test-bufferiszero.c b/tests/test-bufferiszero.c
new file mode 100644
index 0000000..42d194c
--- /dev/null
+++ b/tests/test-bufferiszero.c
@@ -0,0 +1,78 @@
+/*
+ * QEMU buffer_is_zero test
+ *
+ * Copyright (c) 2016 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+
+static char buffer[8 * 1024 * 1024];
+
+static void test_1(void)
+{
+    size_t s, a, o;
+
+    /* Basic positive test.  */
+    g_assert(buffer_is_zero(buffer, sizeof(buffer)));
+
+    /* Basic negative test.  */
+    buffer[sizeof(buffer) - 1] = 1;
+    g_assert(!buffer_is_zero(buffer, sizeof(buffer)));
+    buffer[sizeof(buffer) - 1] = 0;
+
+    /* Positive tests for size and alignment.  */
+    for (a = 1; a <= 64; a++) {
+        for (s = 1; s < 1024; s++) {
+            buffer[a - 1] = 1;
+            buffer[a + s] = 1;
+            g_assert(buffer_is_zero(buffer + a, s));
+            buffer[a - 1] = 0;
+            buffer[a + s] = 0;
+        }
+    }
+
+    /* Negative tests for size, alignment, and the offset of the marker.  */
+    for (a = 1; a <= 64; a++) {
+        for (s = 1; s < 1024; s++) {
+            for (o = 0; o < s; ++o) {
+                buffer[a + o] = 1;
+                g_assert(!buffer_is_zero(buffer + a, s));
+                buffer[a + o] = 0;
+            }
+        }
+    }
+}
+
+static void test_2(void)
+{
+    if (g_test_perf()) {
+        test_1();
+    } else {
+        do {
+            test_1();
+        } while (test_buffer_is_zero_next_accel());
+    }
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/cutils/bufferiszero", test_2);
+
+    return g_test_run();
+}
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index de35408..d21d2af 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -153,6 +153,19 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
     cpuid_cache = cache;
 }
 
+#define HAVE_NEXT_ACCEL
+bool test_buffer_is_zero_next_accel(void)
+{
+    /* If no bits set, we just tested buffer_zero_int, and there
+       are no more acceleration options to test.  */
+    if (cpuid_cache == 0) {
+        return false;
+    }
+    /* Disable the accelerator we used before and select a new one.  */
+    cpuid_cache &= cpuid_cache - 1;
+    return true;
+}
+
 static bool select_accel_fn(const void *buf, size_t len)
 {
     uintptr_t ibuf = (uintptr_t)buf;
@@ -171,6 +184,13 @@ static bool select_accel_fn(const void *buf, size_t len)
 #define select_accel_fn  buffer_zero_int
 #endif
 
+#ifndef HAVE_NEXT_ACCEL
+bool test_buffer_is_zero_next_accel(void)
+{
+    return false;
+}
+#endif
+
 /*
  * Checks if a buffer is all zeroes
  */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 08/10] cutils: Add SSE4 version
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (6 preceding siblings ...)
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 07/10] cutils: Add test for buffer_is_zero Paolo Bonzini
@ 2016-09-13 16:10 ` Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 09/10] cutils: Add generic prefetch Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking Paolo Bonzini
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/bufferiszero.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index d21d2af..932b809 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -107,6 +107,13 @@ ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
 
 #ifdef CONFIG_AVX2_OPT
 #pragma GCC push_options
+#pragma GCC target("sse4")
+#include <smmintrin.h>
+#define SSE4_NONZERO(X)  !_mm_testz_si128((X), (X))
+ACCEL_BUFFER_ZERO(buffer_zero_sse4, 64, __m128i, SSE4_NONZERO)
+#pragma GCC pop_options
+
+#pragma GCC push_options
 #pragma GCC target("avx2")
 #include <immintrin.h>
 #define AVX2_NONZERO(X)  !_mm256_testz_si256((X), (X))
@@ -173,6 +180,9 @@ static bool select_accel_fn(const void *buf, size_t len)
     if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
         return buffer_zero_avx2(buf, len);
     }
+    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
+        return buffer_zero_sse4(buf, len);
+    }
 #endif
     if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
         return buffer_zero_sse2(buf, len);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 09/10] cutils: Add generic prefetch
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (7 preceding siblings ...)
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 08/10] cutils: Add SSE4 version Paolo Bonzini
@ 2016-09-13 16:10 ` Paolo Bonzini
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking Paolo Bonzini
  9 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

There's no real knowledge of the cacheline size,
just prefetching one loop ahead.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <1472496380-19706-7-git-send-email-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/bufferiszero.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 932b809..4d8a8c8 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -38,6 +38,8 @@ static bool NAME(const void *buf, size_t len)                   \
     do {                                                        \
         const VECTYPE *p = buf;                                 \
         VECTYPE t;                                              \
+        __builtin_prefetch(buf + SIZE);                         \
+        barrier();                                              \
         if (SIZE == sizeof(VECTYPE) * 4) {                      \
             t = (p[0] | p[1]) | (p[2] | p[3]);                  \
         } else if (SIZE == sizeof(VECTYPE) * 8) {               \
@@ -210,6 +212,9 @@ bool buffer_is_zero(const void *buf, size_t len)
         return true;
     }
 
+    /* Fetch the beginning of the buffer while we select the accelerator.  */
+    __builtin_prefetch(buf);
+
     /* Use an optimized zero check if possible.  Note that this also
        includes a check for an unrolled loop over 64-bit integers.  */
     return select_accel_fn(buf, len);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking
  2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
                   ` (8 preceding siblings ...)
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 09/10] cutils: Add generic prefetch Paolo Bonzini
@ 2016-09-13 16:10 ` Paolo Bonzini
  2016-09-13 16:27   ` Richard Henderson
  9 siblings, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: rth

From: Richard Henderson <rth@twiddle.net>

Handle alignment of buffers, so that the vector paths can be
used more often.  Add versions for AVX1 and SSE4.1, both of
which have incremental improvements over SSE2.

Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/bufferiszero.c | 139 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 96 insertions(+), 43 deletions(-)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 4d8a8c8..c23589c 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -26,38 +26,6 @@
 #include "qemu/cutils.h"
 #include "qemu/bswap.h"
 
-
-/* vector definitions */
-
-extern void link_error(void);
-
-#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO)         \
-static bool NAME(const void *buf, size_t len)                   \
-{                                                               \
-    const void *end = buf + len;                                \
-    do {                                                        \
-        const VECTYPE *p = buf;                                 \
-        VECTYPE t;                                              \
-        __builtin_prefetch(buf + SIZE);                         \
-        barrier();                                              \
-        if (SIZE == sizeof(VECTYPE) * 4) {                      \
-            t = (p[0] | p[1]) | (p[2] | p[3]);                  \
-        } else if (SIZE == sizeof(VECTYPE) * 8) {               \
-            t  = p[0] | p[1];                                   \
-            t |= p[2] | p[3];                                   \
-            t |= p[4] | p[5];                                   \
-            t |= p[6] | p[7];                                   \
-        } else {                                                \
-            link_error();                                       \
-        }                                                       \
-        if (unlikely(NONZERO(t))) {                             \
-            return false;                                       \
-        }                                                       \
-        buf += SIZE;                                            \
-    } while (buf < end);                                        \
-    return true;                                                \
-}
-
 static bool
 buffer_zero_int(const void *buf, size_t len)
 {
@@ -102,24 +70,110 @@ buffer_zero_int(const void *buf, size_t len)
 #pragma GCC push_options
 #pragma GCC target("sse2")
 #include <emmintrin.h>
-#define SSE2_NONZERO(X) \
-    (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
-ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
+
+static bool
+buffer_zero_sse2(const void *buf, size_t len)
+{
+    __m128i t = _mm_loadu_si128(buf);
+    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+    __m128i zero = _mm_setzero_si128();
+
+    /* Loop over 16-byte aligned blocks of 64.  */
+    while (likely(p <= e)) {
+        __builtin_prefetch(p);
+        t = _mm_cmpeq_epi8(t, zero);
+        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    /* Finish the aligned tail.  */
+    t |= e[-3];
+    t |= e[-2];
+    t |= e[-1];
+
+    /* Finish the unaligned tail.  */
+    t |= _mm_loadu_si128(buf + len - 16);
+
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+}
 #pragma GCC pop_options
 
 #ifdef CONFIG_AVX2_OPT
 #pragma GCC push_options
 #pragma GCC target("sse4")
 #include <smmintrin.h>
-#define SSE4_NONZERO(X)  !_mm_testz_si128((X), (X))
-ACCEL_BUFFER_ZERO(buffer_zero_sse4, 64, __m128i, SSE4_NONZERO)
+
+static bool
+buffer_zero_sse4(const void *buf, size_t len)
+{
+    __m128i t = _mm_loadu_si128(buf);
+    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+    /* Loop over 16-byte aligned blocks of 64.  */
+    while (likely(p <= e)) {
+        __builtin_prefetch(p);
+        if (unlikely(!_mm_testz_si128(t, t))) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    /* Finish the aligned tail.  */
+    t |= e[-3];
+    t |= e[-2];
+    t |= e[-1];
+
+    /* Finish the unaligned tail.  */
+    t |= _mm_loadu_si128(buf + len - 16);
+
+    return _mm_testz_si128(t, t);
+}
 #pragma GCC pop_options
 
 #pragma GCC push_options
 #pragma GCC target("avx2")
 #include <immintrin.h>
-#define AVX2_NONZERO(X)  !_mm256_testz_si256((X), (X))
-ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
+
+static bool
+buffer_zero_avx2(const void *buf, size_t len)
+{
+    /* Begin with an unaligned head of 32 bytes.  */
+    __m256i t = _mm256_loadu_si256(buf);
+    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
+    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+
+    if (likely(p <= e)) {
+        /* Loop over 32-byte aligned blocks of 128.  */
+        do {
+            __builtin_prefetch(p);
+            if (unlikely(!_mm256_testz_si256(t, t))) {
+                return false;
+            }
+            t = p[-4] | p[-3] | p[-2] | p[-1];
+            p += 4;
+        } while (p <= e);
+    } else {
+        t |= _mm256_loadu_si256(buf + 32);
+        if (len <= 128) {
+            goto last2;
+        }
+    }
+
+    /* Finish the last block of 128 unaligned.  */
+    t |= _mm256_loadu_si256(buf + len - 4 * 32);
+    t |= _mm256_loadu_si256(buf + len - 3 * 32);
+ last2:
+    t |= _mm256_loadu_si256(buf + len - 2 * 32);
+    t |= _mm256_loadu_si256(buf + len - 1 * 32);
+
+    return _mm256_testz_si256(t, t);
+}
 #pragma GCC pop_options
 #endif
 
@@ -177,16 +231,15 @@ bool test_buffer_is_zero_next_accel(void)
 
 static bool select_accel_fn(const void *buf, size_t len)
 {
-    uintptr_t ibuf = (uintptr_t)buf;
 #ifdef CONFIG_AVX2_OPT
-    if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
+    if (len >= 128 && (cpuid_cache & CACHE_AVX2)) {
         return buffer_zero_avx2(buf, len);
     }
-    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
+    if (len >= 64 && (cpuid_cache & CACHE_SSE4)) {
         return buffer_zero_sse4(buf, len);
     }
 #endif
-    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
+    if (len >= 64 && (cpuid_cache & CACHE_SSE2)) {
         return buffer_zero_sse2(buf, len);
     }
     return buffer_zero_int(buf, len);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking
  2016-09-13 16:10 ` [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking Paolo Bonzini
@ 2016-09-13 16:27   ` Richard Henderson
  2016-09-13 16:33     ` Paolo Bonzini
  0 siblings, 1 reply; 13+ messages in thread
From: Richard Henderson @ 2016-09-13 16:27 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel

On 09/13/2016 09:10 AM, Paolo Bonzini wrote:
> @@ -177,16 +231,15 @@ bool test_buffer_is_zero_next_accel(void)
>  
>  static bool select_accel_fn(const void *buf, size_t len)
>  {
> -    uintptr_t ibuf = (uintptr_t)buf;
>  #ifdef CONFIG_AVX2_OPT
> -    if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
> +    if (len >= 128 && (cpuid_cache & CACHE_AVX2)) {
>          return buffer_zero_avx2(buf, len);
>      }
> -    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
> +    if (len >= 64 && (cpuid_cache & CACHE_SSE4)) {
>          return buffer_zero_sse4(buf, len);
>      }
>  #endif
> -    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
> +    if (len >= 64 && (cpuid_cache & CACHE_SSE2)) {
>          return buffer_zero_sse2(buf, len);
>      }

You've dropped a major change to select_accel_fn here.

(1) The avx2 routine, as written, can support len >= 64, therefore a common
test works for all of the vectorized functions.

(2) I had saved the pointer to the routine, so that we didn't have to
repeatedly test multiple cpuid_cache bits.


r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking
  2016-09-13 16:27   ` Richard Henderson
@ 2016-09-13 16:33     ` Paolo Bonzini
  0 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2016-09-13 16:33 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel



On 13/09/2016 18:27, Richard Henderson wrote:
> On 09/13/2016 09:10 AM, Paolo Bonzini wrote:
>> @@ -177,16 +231,15 @@ bool test_buffer_is_zero_next_accel(void)
>>  
>>  static bool select_accel_fn(const void *buf, size_t len)
>>  {
>> -    uintptr_t ibuf = (uintptr_t)buf;
>>  #ifdef CONFIG_AVX2_OPT
>> -    if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
>> +    if (len >= 128 && (cpuid_cache & CACHE_AVX2)) {
>>          return buffer_zero_avx2(buf, len);
>>      }
>> -    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
>> +    if (len >= 64 && (cpuid_cache & CACHE_SSE4)) {
>>          return buffer_zero_sse4(buf, len);
>>      }
>>  #endif
>> -    if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
>> +    if (len >= 64 && (cpuid_cache & CACHE_SSE2)) {
>>          return buffer_zero_sse2(buf, len);
>>      }
> 
> You've dropped a major change to select_accel_fn here.
> 
> (1) The avx2 routine, as written, can support len >= 64, therefore a common
> test works for all of the vectorized functions.
> 
> (2) I had saved the pointer to the routine, so that we didn't have to
> repeatedly test multiple cpuid_cache bits.

Can you send a replacement for this patch only?

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2016-09-13 16:33 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-13 16:09 [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero Paolo Bonzini
2016-09-13 16:09 ` [Qemu-devel] [PATCH 01/10] cutils: Move buffer_is_zero and subroutines to a new file Paolo Bonzini
2016-09-13 16:09 ` [Qemu-devel] [PATCH 02/10] cutils: Remove SPLAT macro Paolo Bonzini
2016-09-13 16:09 ` [Qemu-devel] [PATCH 03/10] cutils: Export only buffer_is_zero Paolo Bonzini
2016-09-13 16:09 ` [Qemu-devel] [PATCH 04/10] cutils: Rearrange buffer_is_zero acceleration Paolo Bonzini
2016-09-13 16:10 ` [Qemu-devel] [PATCH 05/10] cutils: Remove aarch64 buffer zero checking Paolo Bonzini
2016-09-13 16:10 ` [Qemu-devel] [PATCH 06/10] cutils: Remove ppc " Paolo Bonzini
2016-09-13 16:10 ` [Qemu-devel] [PATCH 07/10] cutils: Add test for buffer_is_zero Paolo Bonzini
2016-09-13 16:10 ` [Qemu-devel] [PATCH 08/10] cutils: Add SSE4 version Paolo Bonzini
2016-09-13 16:10 ` [Qemu-devel] [PATCH 09/10] cutils: Add generic prefetch Paolo Bonzini
2016-09-13 16:10 ` [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking Paolo Bonzini
2016-09-13 16:27   ` Richard Henderson
2016-09-13 16:33     ` Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.