All of lore.kernel.org
 help / color / mirror / Atom feed
From: Liang Li <liang.z.li@intel.com>
To: qemu-devel@nongnu.org
Cc: Liang Li <liang.z.li@intel.com>,
	quintela@redhat.com, mst@redhat.com, dgilbert@redhat.com,
	stefanha@redhat.com, amit.shah@redhat.com, pbonzini@redhat.com,
	rth@twiddle.net
Subject: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
Date: Tue,  8 Dec 2015 20:08:53 +0800	[thread overview]
Message-ID: <1449576535-3369-2-git-send-email-liang.z.li@intel.com> (raw)
In-Reply-To: <1449576535-3369-1-git-send-email-liang.z.li@intel.com>

buffer_find_nonzero_offset() is a hot function during live migration.
Now it use SSE2 intructions for optimization. For platform supports
AVX2 instructions, use the AVX2 instructions for optimization can help
to improve the performance about 30% comparing to SSE2.
Zero page check can be faster with this optimization, the test result
shows that for an 8GB RAM idle guest, this patch can help to shorten
the total live migration time about 6%.

This patch use the ifunc mechanism to select the proper function when
running, for platform supports AVX2, excute the AVX2 instructions,
else, excute the original code.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 include/qemu-common.h   | 13 +++++-----
 util/Makefile.objs      |  2 ++
 util/buffer-zero-avx2.c | 54 ++++++++++++++++++++++++++++++++++++++++
 util/cutils.c           | 65 +++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 125 insertions(+), 9 deletions(-)
 create mode 100644 util/buffer-zero-avx2.c

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 405364f..be8ba79 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,14 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+bool can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+#endif
+
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 89dd80e..a130b35 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-$(CONFIG_AVX2) += buffer-zero-avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -30,3 +31,4 @@ util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
 util-obj-y += timed-average.o
+buffer-zero-avx2.o-cflags      := $(AVX2_CFLAGS)
diff --git a/util/buffer-zero-avx2.c b/util/buffer-zero-avx2.c
new file mode 100644
index 0000000..b9da0e3
--- /dev/null
+++ b/util/buffer-zero-avx2.c
@@ -0,0 +1,54 @@
+#include "qemu-common.h"
+
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+#endif
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..3631c02 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -26,6 +26,7 @@
 #include <math.h>
 #include <limits.h>
 #include <errno.h>
+#include <cpuid.h>
 
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
@@ -161,6 +162,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static inline bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -181,13 +190,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;
@@ -216,6 +225,58 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len)
     return i * sizeof(VECTYPE);
 }
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+/* old compiler maynot define bit_AVX2 */
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+
+static bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+    return b & bit_AVX2;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
+size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+
+static void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+#else
+
+inline bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+#endif
+
 /*
  * Checks if a buffer is all zeroes
  *
-- 
1.9.1

  reply	other threads:[~2015-12-08 12:10 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-12-08 12:08 [Qemu-devel] [v3 0/3] add avx2 instruction optimization Liang Li
2015-12-08 12:08 ` Liang Li [this message]
2015-12-08 16:09   ` [Qemu-devel] [v3 1/3] cutils: " Richard Henderson
2015-12-09  9:32     ` Li, Liang Z
2015-12-09 14:57       ` Richard Henderson
2015-12-10  1:10         ` Li, Liang Z
2015-12-10  9:03         ` Paolo Bonzini
2015-12-10  9:22           ` Li, Liang Z
2015-12-10  9:51             ` Paolo Bonzini
2015-12-08 12:08 ` [Qemu-devel] [v3 2/3] configure: detect ifunc attribute Liang Li
2015-12-08 12:08 ` [Qemu-devel] [v3 3/3] configure: add options to config avx2 Liang Li
2015-12-08 12:54   ` Peter Maydell
2015-12-08 14:18     ` Li, Liang Z

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1449576535-3369-2-git-send-email-liang.z.li@intel.com \
    --to=liang.z.li@intel.com \
    --cc=amit.shah@redhat.com \
    --cc=dgilbert@redhat.com \
    --cc=mst@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=quintela@redhat.com \
    --cc=rth@twiddle.net \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.