All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [v3 0/3] add avx2 instruction optimization
@ 2015-12-08 12:08 Liang Li
  2015-12-08 12:08 ` [Qemu-devel] [v3 1/3] cutils: " Liang Li
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Liang Li @ 2015-12-08 12:08 UTC (permalink / raw)
  To: qemu-devel
  Cc: Liang Li, quintela, mst, dgilbert, stefanha, amit.shah, pbonzini, rth

buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2, using AVX2 instructions instead can improve performance
by about 30% compared to SSE2.
The zero page check is faster with this optimization; test results
show that for an idle guest with 8GB of RAM, this patch shortens
the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 code is executed;
otherwise, the original code is executed.

With this patch, the same QEMU binary can run on platforms with or
without AVX2 support.

Compilers that don't support AVX2 or the ifunc attribute can still
build the source code successfully.


v2 -> v3 changes:
  * Detect the ifunc attribute support (Paolo's suggestion) 
  * Use the ifunc attribute instead of the inline asm (Richard's suggestion)
  * Change the configure (Juan's suggestion)


Liang Li (3):
  cutils: add avx2 instruction optimization
  configure: detect ifunc attribute
  configure: add options to config avx2

 configure               | 50 +++++++++++++++++++++++++++++++++++++
 include/qemu-common.h   | 13 +++++-----
 util/Makefile.objs      |  2 ++
 util/buffer-zero-avx2.c | 54 ++++++++++++++++++++++++++++++++++++++++
 util/cutils.c           | 65 +++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 175 insertions(+), 9 deletions(-)
 create mode 100644 util/buffer-zero-avx2.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-08 12:08 [Qemu-devel] [v3 0/3] add avx2 instruction optimization Liang Li
@ 2015-12-08 12:08 ` Liang Li
  2015-12-08 16:09   ` Richard Henderson
  2015-12-08 12:08 ` [Qemu-devel] [v3 2/3] configure: detect ifunc attribute Liang Li
  2015-12-08 12:08 ` [Qemu-devel] [v3 3/3] configure: add options to config avx2 Liang Li
  2 siblings, 1 reply; 13+ messages in thread
From: Liang Li @ 2015-12-08 12:08 UTC (permalink / raw)
  To: qemu-devel
  Cc: Liang Li, quintela, mst, dgilbert, stefanha, amit.shah, pbonzini, rth

buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2, using AVX2 instructions instead can improve performance
by about 30% compared to SSE2.
The zero page check is faster with this optimization; test results
show that for an idle guest with 8GB of RAM, this patch shortens
the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 code is executed;
otherwise, the original code is executed.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 include/qemu-common.h   | 13 +++++-----
 util/Makefile.objs      |  2 ++
 util/buffer-zero-avx2.c | 54 ++++++++++++++++++++++++++++++++++++++++
 util/cutils.c           | 65 +++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 125 insertions(+), 9 deletions(-)
 create mode 100644 util/buffer-zero-avx2.c

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 405364f..be8ba79 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,14 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+bool can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+#endif
+
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 89dd80e..a130b35 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-$(CONFIG_AVX2) += buffer-zero-avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -30,3 +31,4 @@ util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
 util-obj-y += timed-average.o
+buffer-zero-avx2.o-cflags      := $(AVX2_CFLAGS)
diff --git a/util/buffer-zero-avx2.c b/util/buffer-zero-avx2.c
new file mode 100644
index 0000000..b9da0e3
--- /dev/null
+++ b/util/buffer-zero-avx2.c
@@ -0,0 +1,54 @@
+#include "qemu-common.h"
+
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+#endif
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..3631c02 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -26,6 +26,7 @@
 #include <math.h>
 #include <limits.h>
 #include <errno.h>
+#include <cpuid.h>
 
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
@@ -161,6 +162,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static inline bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -181,13 +190,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;
@@ -216,6 +225,58 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len)
     return i * sizeof(VECTYPE);
 }
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+/* old compiler maynot define bit_AVX2 */
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+
+static bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+    return b & bit_AVX2;
+}
+
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
+size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
+         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
+
+static void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+static void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+#else
+
+inline bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+#endif
+
 /*
  * Checks if a buffer is all zeroes
  *
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [v3 2/3] configure: detect ifunc attribute
  2015-12-08 12:08 [Qemu-devel] [v3 0/3] add avx2 instruction optimization Liang Li
  2015-12-08 12:08 ` [Qemu-devel] [v3 1/3] cutils: " Liang Li
@ 2015-12-08 12:08 ` Liang Li
  2015-12-08 12:08 ` [Qemu-devel] [v3 3/3] configure: add options to config avx2 Liang Li
  2 siblings, 0 replies; 13+ messages in thread
From: Liang Li @ 2015-12-08 12:08 UTC (permalink / raw)
  To: qemu-devel
  Cc: Liang Li, quintela, mst, dgilbert, stefanha, amit.shah, pbonzini, rth

Detect whether the compiler supports the ifunc attribute; the AVX2
optimization depends on it.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 configure | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/configure b/configure
index b9552fd..394db3b 100755
--- a/configure
+++ b/configure
@@ -310,6 +310,7 @@ smartcard=""
 libusb=""
 usb_redir=""
 opengl=""
+ifunc=""
 zlib="yes"
 lzo=""
 snappy=""
@@ -1827,6 +1828,20 @@ EOF
 fi
 
 ##########################################
+# ifunc check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void foo(void) __attribute__((ifunc("bar")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "" "" ; then
+    ifunc="yes"
+else
+    ifunc="no"
+fi
+
+#########################################
 # zlib check
 
 if test "$zlib" != "no" ; then
@@ -4837,6 +4852,7 @@ echo "libssh2 support   $libssh2"
 echo "TPM passthrough   $tpm_passthrough"
 echo "QOM debugging     $qom_cast_debug"
 echo "vhdx              $vhdx"
+echo "ifunc support     $ifunc"
 echo "lzo support       $lzo"
 echo "snappy support    $snappy"
 echo "bzip2 support     $bzip2"
@@ -5221,6 +5237,10 @@ if test "$opengl" = "yes" ; then
   echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
 fi
 
+if test "$ifunc" = "yes" ; then
+  echo "CONFIG_IFUNC=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Qemu-devel] [v3 3/3] configure: add options to config avx2
  2015-12-08 12:08 [Qemu-devel] [v3 0/3] add avx2 instruction optimization Liang Li
  2015-12-08 12:08 ` [Qemu-devel] [v3 1/3] cutils: " Liang Li
  2015-12-08 12:08 ` [Qemu-devel] [v3 2/3] configure: detect ifunc attribute Liang Li
@ 2015-12-08 12:08 ` Liang Li
  2015-12-08 12:54   ` Peter Maydell
  2 siblings, 1 reply; 13+ messages in thread
From: Liang Li @ 2015-12-08 12:08 UTC (permalink / raw)
  To: qemu-devel
  Cc: Liang Li, quintela, mst, dgilbert, stefanha, amit.shah, pbonzini, rth

Add the '--enable-avx2' & '--disable-avx2' options to configure
the AVX2 instruction optimization.

Unless '--disable-avx2' is given, configure will detect whether the
compiler supports the AVX2 option; if it does, the AVX2 optimization is
enabled, otherwise it is disabled.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 configure | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/configure b/configure
index 394db3b..94e45fa 100755
--- a/configure
+++ b/configure
@@ -311,6 +311,7 @@ libusb=""
 usb_redir=""
 opengl=""
 ifunc=""
+avx2=""
 zlib="yes"
 lzo=""
 snappy=""
@@ -1063,6 +1064,10 @@ for opt do
   ;;
   --enable-usb-redir) usb_redir="yes"
   ;;
+  --disable-avx2) avx2="no"
+  ;;
+  --enable-avx2) avx2="yes"
+  ;;
   --disable-zlib-test) zlib="no"
   ;;
   --disable-lzo) lzo="no"
@@ -1378,6 +1383,7 @@ disabled with --disable-FEATURE, default is enabled if available:
   smartcard       smartcard support (libcacard)
   libusb          libusb (for usb passthrough)
   usb-redir       usb network redirection support
+  avx2            support of avx2 instruction
   lzo             support of lzo compression library
   snappy          support of snappy compression library
   bzip2           support of bzip2 compression library
@@ -1841,6 +1847,23 @@ else
     ifunc="no"
 fi
 
+########################################
+# avx2 check
+
+if test "$avx2" != "no" ; then
+    cat > $TMPC << EOF
+int main(void) { return 0; }
+EOF
+    if compile_prog "" "-mavx2" ; then
+        avx2="yes"
+    else
+        if test "$avx2" = "yes" ; then
+            feature_not_found "avx2" "Your compiler don't support avx2"
+        fi
+        avx2="no"
+    fi
+fi
+
 #########################################
 # zlib check
 
@@ -4853,6 +4876,7 @@ echo "TPM passthrough   $tpm_passthrough"
 echo "QOM debugging     $qom_cast_debug"
 echo "vhdx              $vhdx"
 echo "ifunc support     $ifunc"
+echo "avx2 support      $avx2"
 echo "lzo support       $lzo"
 echo "snappy support    $snappy"
 echo "bzip2 support     $bzip2"
@@ -5241,6 +5265,12 @@ if test "$ifunc" = "yes" ; then
   echo "CONFIG_IFUNC=y" >> $config_host_mak
 fi
 
+if test "$avx2" = "yes" ; then
+  avx2_cflags=" -mavx2"
+  echo "AVX2_CFLAGS=$avx2_cflags" >> $config_host_mak
+  echo "CONFIG_AVX2=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 3/3] configure: add options to config avx2
  2015-12-08 12:08 ` [Qemu-devel] [v3 3/3] configure: add options to config avx2 Liang Li
@ 2015-12-08 12:54   ` Peter Maydell
  2015-12-08 14:18     ` Li, Liang Z
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Maydell @ 2015-12-08 12:54 UTC (permalink / raw)
  To: Liang Li
  Cc: Michael S. Tsirkin, Juan Quintela, QEMU Developers,
	Dr. David Alan Gilbert, Stefan Hajnoczi, Amit Shah,
	Paolo Bonzini, Richard Henderson

On 8 December 2015 at 12:08, Liang Li <liang.z.li@intel.com> wrote:
> Add the '--enable-avx2' & '--disable-avx2' option so as to config
> the AVX2 instruction optimization.
>
> If '--disable-avx2' is not set, configure will detect if the compiler
> can support AVX2 option, if yes, AVX2 optimization is eabled, else
> disabled.

Is the configure option necessary? For other things like
this (eg our use of SSE2 or Altivec) we just go ahead and
use the feature if the compiler supports it.

When would somebody building QEMU want to disable this option?

thanks
-- PMM

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 3/3] configure: add options to config avx2
  2015-12-08 12:54   ` Peter Maydell
@ 2015-12-08 14:18     ` Li, Liang Z
  0 siblings, 0 replies; 13+ messages in thread
From: Li, Liang Z @ 2015-12-08 14:18 UTC (permalink / raw)
  To: Peter Maydell
  Cc: Michael S. Tsirkin, Juan Quintela, QEMU Developers,
	Dr. David Alan Gilbert, Stefan Hajnoczi, Amit Shah,
	Paolo Bonzini, Richard Henderson

> On 8 December 2015 at 12:08, Liang Li <liang.z.li@intel.com> wrote:
> > Add the '--enable-avx2' & '--disable-avx2' option so as to config the
> > AVX2 instruction optimization.
> >
> > If '--disable-avx2' is not set, configure will detect if the compiler
> > can support AVX2 option, if yes, AVX2 optimization is eabled, else
> > disabled.
> 
> Is the configure option necessary? For other things like this (eg our use of
> SSE2 or Altivec) we just go ahead and use the feature if the compiler
> supports it.
> 

It seems unnecessary.

> When would somebody building QEMU want to disable this option?
> 
> thanks
> -- PMM

The v1 of this patch had the '--enable-avx2' & '--disable-avx2' options because that version did not
support ifunc, and I left them in place in the following versions.
I will remove them if they are unnecessary.

Thanks for your comments.

Liang



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-08 12:08 ` [Qemu-devel] [v3 1/3] cutils: " Liang Li
@ 2015-12-08 16:09   ` Richard Henderson
  2015-12-09  9:32     ` Li, Liang Z
  0 siblings, 1 reply; 13+ messages in thread
From: Richard Henderson @ 2015-12-08 16:09 UTC (permalink / raw)
  To: Liang Li, qemu-devel
  Cc: quintela, mst, dgilbert, stefanha, amit.shah, pbonzini

On 12/08/2015 04:08 AM, Liang Li wrote:
> +++ b/util/buffer-zero-avx2.c
> @@ -0,0 +1,54 @@
> +#include "qemu-common.h"
> +
> +#if defined CONFIG_IFUNC && defined CONFIG_AVX2
> +#include <immintrin.h>
> +#define AVX2_VECTYPE        __m256i
> +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> +#define AVX2_ALL_EQ(v1, v2) \
> +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> +
> +inline bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +                   * sizeof(AVX2_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
> +}

I'm not keen on adding a new file for this.  You ought to be able to use
__attribute__((target("-mavx2"))) on any compiler that supports the
command-line option.  Which means you can do this all in one file with static
functions.

Nor am I keen on marking a function inline when we know it must be out-of-line
because of the ifunc usage.


r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-08 16:09   ` Richard Henderson
@ 2015-12-09  9:32     ` Li, Liang Z
  2015-12-09 14:57       ` Richard Henderson
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Liang Z @ 2015-12-09  9:32 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel
  Cc: mst, quintela, dgilbert, stefanha, amit.shah, pbonzini

> On 12/08/2015 04:08 AM, Liang Li wrote:
> > +++ b/util/buffer-zero-avx2.c
> > @@ -0,0 +1,54 @@
> > +#include "qemu-common.h"
> > +
> > +#if defined CONFIG_IFUNC && defined CONFIG_AVX2 #include
> > +<immintrin.h>
> > +#define AVX2_VECTYPE        __m256i
> > +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> > +#define AVX2_ALL_EQ(v1, v2) \
> > +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) ==
> 0xFFFFFFFF)
> > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> > +
> > +inline bool
> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> > +{
> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> > +                   * sizeof(AVX2_VECTYPE)) == 0
> > +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); }
> 
> I'm not keen on adding a new file for this.  You ought to be able to use
> __attribute__((target("-mavx2"))) on any compiler that supports the
> command-line option.  Which means you can do this all in one file with static
> functions.
> 

I think you mean '__attribute__((target("avx2")))'. I have tried this approach; the issue is that
without the '-mavx2' option for gcc there is a compile error, '__m256i undeclared', and __attribute__((target("avx2")))
doesn't solve it.  Any idea?

If I put the AVX2 intrinsics and the SSE2 intrinsics in a single file, the SSE2 intrinsics will be compiled to AVX2 instructions, which is not what we want.

> Nor am I keen on marking a function inline when we know it must be out-of-
> line because of the ifunc usage.

Inline can be removed.

Thanks 

Liang
> 
> r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-09  9:32     ` Li, Liang Z
@ 2015-12-09 14:57       ` Richard Henderson
  2015-12-10  1:10         ` Li, Liang Z
  2015-12-10  9:03         ` Paolo Bonzini
  0 siblings, 2 replies; 13+ messages in thread
From: Richard Henderson @ 2015-12-09 14:57 UTC (permalink / raw)
  To: Li, Liang Z, qemu-devel
  Cc: mst, quintela, dgilbert, stefanha, amit.shah, pbonzini

On 12/09/2015 01:32 AM, Li, Liang Z wrote:
> I think you means the ' __attribute__((target("avx2")))', I have tried this way, the issue here is:
>   without the ' -mavx2' option for gcc, there are compiling error:  '__m256i undeclared', the __attribute__((target("avx2")))
> can't solve this issue.  Any idea?

You're right that you can't use the normal __m256i, as it doesn't get declared. 
  But you can define the same type within the function itself.

Which is a simple matter of

   typedef long long __m256i __attribute__((vector_size(32)));

 From there, you might as well rely on other gcc extensions to instead write

    __m256i tmp0 = p[i + 0] | p[i + 1];

rather than obfuscating the code with AVX2_VEC_OR.



r~

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-09 14:57       ` Richard Henderson
@ 2015-12-10  1:10         ` Li, Liang Z
  2015-12-10  9:03         ` Paolo Bonzini
  1 sibling, 0 replies; 13+ messages in thread
From: Li, Liang Z @ 2015-12-10  1:10 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel
  Cc: mst, quintela, dgilbert, stefanha, amit.shah, pbonzini

> On 12/09/2015 01:32 AM, Li, Liang Z wrote:
> > I think you means the ' __attribute__((target("avx2")))', I have tried this
> way, the issue here is:
> >   without the ' -mavx2' option for gcc, there are compiling error:
> > '__m256i undeclared', the __attribute__((target("avx2"))) can't solve this
> issue.  Any idea?
> 
> You're right that you can't use the normal __m256i, as it doesn't get declared.
>   But you can define the same type within the function itself.
> 
> Which is a simple matter of
> 
>    typedef long long __m256i __attribute__((vector_size(32)));
> 
>  From there, you might as well rely on other gcc extensions to instead write
> 
>     __m256i tmp0 = p[i + 0] | p[i + 1];
> 
> rather than obfuscating the code with AVX2_VEC_OR.
> 
>
Compared with this approach, I think putting the related code in a separate file is simpler.

Thanks
Liang 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-09 14:57       ` Richard Henderson
  2015-12-10  1:10         ` Li, Liang Z
@ 2015-12-10  9:03         ` Paolo Bonzini
  2015-12-10  9:22           ` Li, Liang Z
  1 sibling, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2015-12-10  9:03 UTC (permalink / raw)
  To: Richard Henderson, Li, Liang Z, qemu-devel
  Cc: amit.shah, quintela, dgilbert, stefanha, mst



On 09/12/2015 15:57, Richard Henderson wrote:
>> I think you means the ' __attribute__((target("avx2")))', I have tried
>> this way, the issue here is:
>>   without the ' -mavx2' option for gcc, there are compiling error: 
>> '__m256i undeclared', the __attribute__((target("avx2")))
>> can't solve this issue.  Any idea?
> 
> You're right that you can't use the normal __m256i, as it doesn't get
> declared.

It should be declared.  *intrin.h uses #pragma GCC target and always
defines all vector types.

In fact, the following compiles for me with just "gcc foo.c" under
GCC 5.x:

#include <immintrin.h>

// #if defined CONFIG_IFUNC && defined CONFIG_AVX2
#pragma GCC push_options
#pragma GCC target("avx2")
#define AVX2_VECTYPE        __m256i
#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
#define AVX2_ALL_EQ(v1, v2) \
    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))

size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
{
    const AVX2_VECTYPE *p = buf;
    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
    size_t i;

    if (!len) {
        return 0;
    }

    for (i = 0; i < 4; i++) {
        if (!AVX2_ALL_EQ(p[i], zero)) {
            return i * sizeof(AVX2_VECTYPE);
        }
    }

    for (i = 4; i < len / sizeof(AVX2_VECTYPE); i += 4) {
        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
            break;
        }
    }

    return i * sizeof(AVX2_VECTYPE);
}

#pragma GCC pop_options
// #endif

so perhaps the configure test is testing the wrong thing?

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-10  9:03         ` Paolo Bonzini
@ 2015-12-10  9:22           ` Li, Liang Z
  2015-12-10  9:51             ` Paolo Bonzini
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Liang Z @ 2015-12-10  9:22 UTC (permalink / raw)
  To: Paolo Bonzini, Richard Henderson, qemu-devel
  Cc: amit.shah, quintela, dgilbert, stefanha, mst

> >>   without the ' -mavx2' option for gcc, there are compiling error:
> >> '__m256i undeclared', the __attribute__((target("avx2"))) can't solve
> >> this issue.  Any idea?
> >
> > You're right that you can't use the normal __m256i, as it doesn't get
> > declared.
> 
> It should be declared.  *intrin.h uses #pragma GCC target and always defines
> all vector types.
> 
> In fact, the following compiles for me with just "gcc foo.c" under GCC 5.x:
> 
> #include <immintrin.h>
> 
> // #if defined CONFIG_IFUNC && defined CONFIG_AVX2 #pragma GCC
> push_options #pragma GCC target("avx2")
> #define AVX2_VECTYPE        __m256i
> #define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> #define AVX2_ALL_EQ(v1, v2) \
>     (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> 
> size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) {
>     const AVX2_VECTYPE *p = buf;
>     const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
>     size_t i;
> 
>     if (!len) {
>         return 0;
>     }
> 
>     for (i = 0; i < 4; i++) {
>         if (!AVX2_ALL_EQ(p[i], zero)) {
>             return i * sizeof(AVX2_VECTYPE);
>         }
>     }
> 
>     for (i = 4; i < len / sizeof(AVX2_VECTYPE); i += 4) {
>         AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
>         AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
>         AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
>         AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
>         AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
>         AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
>         if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
>             break;
>         }
>     }
> 
>     return i * sizeof(AVX2_VECTYPE);
> }
> 
> #pragma GCC pop_options
> // #endif
> 
> so perhaps the configure test is testing the wrong thing?
> 
> Paolo

Hi Paolo,

What's your opinion?  Should I put the AVX2-related code in util/cutils.c and use the "#pragma ..." you referred to?
The configure test is OK; it uses "-mavx2".

Liang

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
  2015-12-10  9:22           ` Li, Liang Z
@ 2015-12-10  9:51             ` Paolo Bonzini
  0 siblings, 0 replies; 13+ messages in thread
From: Paolo Bonzini @ 2015-12-10  9:51 UTC (permalink / raw)
  To: Li, Liang Z, Richard Henderson, qemu-devel
  Cc: amit.shah, quintela, dgilbert, stefanha, mst



On 10/12/2015 10:22, Li, Liang Z wrote:
>>>>   without the ' -mavx2' option for gcc, there are compiling error:
>>>> '__m256i undeclared', the __attribute__((target("avx2"))) can't solve
>>>> this issue.  Any idea?
>>>
>>> You're right that you can't use the normal __m256i, as it doesn't get
>>> declared.
>>
>> It should be declared.  *intrin.h uses #pragma GCC target and always defines
>> all vector types.
>>
>> In fact, the following compiles for me with just "gcc foo.c" under GCC 5.x:
>>
>> #include <immintrin.h>
>>
>> // #if defined CONFIG_IFUNC && defined CONFIG_AVX2 #pragma GCC
>> push_options #pragma GCC target("avx2")
>> #define AVX2_VECTYPE        __m256i
>> #define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
>> #define AVX2_ALL_EQ(v1, v2) \
>>     (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
>> #define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
>>
>> size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) {
>>     const AVX2_VECTYPE *p = buf;
>>     const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
>>     size_t i;
>>
>>     if (!len) {
>>         return 0;
>>     }
>>
>>     for (i = 0; i < 4; i++) {
>>         if (!AVX2_ALL_EQ(p[i], zero)) {
>>             return i * sizeof(AVX2_VECTYPE);
>>         }
>>     }
>>
>>     for (i = 4; i < len / sizeof(AVX2_VECTYPE); i += 4) {
>>         AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
>>         AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
>>         AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
>>         AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
>>         AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
>>         AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
>>         if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
>>             break;
>>         }
>>     }
>>
>>     return i * sizeof(AVX2_VECTYPE);
>> }
>>
>> #pragma GCC pop_options
>> // #endif
>>
>> so perhaps the configure test is testing the wrong thing?
>>
>> Paolo
> 
> Hi Paolo,
> 
> what's your opinion?  putting the AVX2 related code to util/cutils.c and use the "#pragma ..." you referred?

Yes, that's best.  And you can keep using __m256i if you prefer that.

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2015-12-10  9:51 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-08 12:08 [Qemu-devel] [v3 0/3] add avx2 instruction optimization Liang Li
2015-12-08 12:08 ` [Qemu-devel] [v3 1/3] cutils: " Liang Li
2015-12-08 16:09   ` Richard Henderson
2015-12-09  9:32     ` Li, Liang Z
2015-12-09 14:57       ` Richard Henderson
2015-12-10  1:10         ` Li, Liang Z
2015-12-10  9:03         ` Paolo Bonzini
2015-12-10  9:22           ` Li, Liang Z
2015-12-10  9:51             ` Paolo Bonzini
2015-12-08 12:08 ` [Qemu-devel] [v3 2/3] configure: detect ifunc attribute Liang Li
2015-12-08 12:08 ` [Qemu-devel] [v3 3/3] configure: add options to config avx2 Liang Li
2015-12-08 12:54   ` Peter Maydell
2015-12-08 14:18     ` Li, Liang Z

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.