From mboxrd@z Thu Jan  1 00:00:00 1970
From: "Jan Beulich" <JBeulich@suse.com>
Subject: [PATCH v2 11/11] x86emul: test coverage for SSE/SSE2
	insns
Date: Wed, 01 Feb 2017 04:19:09 -0700
Message-ID: <5891D23D0200007800135C17@prv-mh.provo.novell.com>
References: <5891CF990200007800135BC5@prv-mh.provo.novell.com>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="=__Part58619C3D.1__="
Return-path: <xen-devel-bounces@lists.xen.org>
Received: from mail6.bemta6.messagelabs.com ([193.109.254.103])
 by lists.xenproject.org with esmtp (Exim 4.84_2)
 (envelope-from <JBeulich@suse.com>) id 1cYswW-0003m7-Pq
 for xen-devel@lists.xenproject.org; Wed, 01 Feb 2017 11:19:17 +0000
In-Reply-To: <5891CF990200007800135BC5@prv-mh.provo.novell.com>
List-Unsubscribe: <https://lists.xen.org/cgi-bin/mailman/options/xen-devel>,
 <mailto:xen-devel-request@lists.xen.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xen.org>
List-Help: <mailto:xen-devel-request@lists.xen.org?subject=help>
List-Subscribe: <https://lists.xen.org/cgi-bin/mailman/listinfo/xen-devel>,
 <mailto:xen-devel-request@lists.xen.org?subject=subscribe>
Errors-To: xen-devel-bounces@lists.xen.org
Sender: "Xen-devel" <xen-devel-bounces@lists.xen.org>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
List-Id: xen-devel@lists.xenproject.org

This is a MIME message. If you are reading this text, you may want to 
consider changing to a mail reader or gateway that understands how to 
properly handle MIME multipart messages.

--=__Part58619C3D.1__=
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: quoted-printable
Content-Disposition: inline

... and their AVX equivalents. Note that a few instructions aren't
covered (yet), but those all fall into common pattern groups, so I
would hope that for now we can do with what is there.

MMX insns aren't being covered at all, as they're not easy to deal
with: The compiler refuses to emit them other than for uses of built-in
functions.

The current way of testing AVX insns is meant to be temporary only:
Once we fully support that feature, the present tests should be
replaced by full ones, rather than full ones simply being added.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,11 +11,36 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
=20
-TESTCASES :=3D blowfish
+TESTCASES :=3D blowfish simd
=20
 blowfish-cflags :=3D ""
 blowfish-cflags-x86_32 :=3D "-mno-accumulate-outgoing-args -Dstatic=3D"
=20
+sse-vecs :=3D 16
+sse-ints :=3D
+sse-flts :=3D 4
+sse2-vecs :=3D $(sse-vecs)
+sse2-ints :=3D 1 2 4 8
+sse2-flts :=3D 4 8
+
+# When converting SSE to AVX, have the compiler avoid XMM0 to widen
+# coverage of the VEX.vvvv checks in the emulator.
+sse2avx :=3D -ffixed-xmm0 -Wa,-msse2avx
+
+simd-cflags :=3D $(foreach flavor,sse sse2, \
+                 $(foreach vec,$($(flavor)-vecs), \
+                   $(foreach int,$($(flavor)-ints), \
+                     "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE=
=3D$(vec) -DINT_SIZE=3D$(int)" \
+                     "-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=
=3D$(vec) -DUINT_SIZE=3D$(int)" \
+                     "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx)=
 -O2 -DVEC_SIZE=3D$(vec) -DINT_SIZE=3D$(int)" \
+                     "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx)=
 -O2 -DVEC_SIZE=3D$(vec) -DUINT_SIZE=3D$(int)") \
+                   $(foreach flt,$($(flavor)-flts), \
+                     "-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=
=3D$(vec) -DFLOAT_SIZE=3D$(flt)" \
+                     "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx)=
 -O2 -DVEC_SIZE=3D$(vec) -DFLOAT_SIZE=3D$(flt)")) \
+                 $(foreach flt,$($(flavor)-flts), \
+                   "-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=3Dsse -O2 =
-DFLOAT_SIZE=3D$(flt)" \
+                   "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=3Dsse =
$(sse2avx) -O2 -DFLOAT_SIZE=3D$(flt)"))
+
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
 	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPI=
LE_ARCH), \
--- /dev/null
+++ b/tools/tests/x86_emulator/simd.c
@@ -0,0 +1,450 @@
+#include <stdbool.h>
+
+asm (
+    "\t.text\n"
+    "\t.globl _start\n"
+    "_start:\n"
+#if defined(__i386__) && VEC_SIZE =3D=3D 16
+    "\tpush %ebp\n"
+    "\tmov %esp,%ebp\n"
+    "\tand $~0xf,%esp\n"
+    "\tcall simd_test\n"
+    "\tleave\n"
+    "\tret"
+#else
+    "\tjmp simd_test"
+#endif
+    );
+
+typedef
+#if defined(INT_SIZE)
+# define ELEM_SIZE INT_SIZE
+signed int
+# if INT_SIZE =3D=3D 1
+#  define MODE QI
+# elif INT_SIZE =3D=3D 2
+#  define MODE HI
+# elif INT_SIZE =3D=3D 4
+#  define MODE SI
+# elif INT_SIZE =3D=3D 8
+#  define MODE DI
+# endif
+#elif defined(UINT_SIZE)
+# define ELEM_SIZE UINT_SIZE
+unsigned int
+# if UINT_SIZE =3D=3D 1
+#  define MODE QI
+# elif UINT_SIZE =3D=3D 2
+#  define MODE HI
+# elif UINT_SIZE =3D=3D 4
+#  define MODE SI
+# elif UINT_SIZE =3D=3D 8
+#  define MODE DI
+# endif
+#elif defined(FLOAT_SIZE)
+float
+# define ELEM_SIZE FLOAT_SIZE
+# if FLOAT_SIZE =3D=3D 4
+#  define MODE SF
+# elif FLOAT_SIZE =3D=3D 8
+#  define MODE DF
+# endif
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE ELEM_SIZE
+#endif
+__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
+
+#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
+
+typedef unsigned int __attribute((mode(QI), vector_size(VEC_SIZE))) =
byte_vec_t;
+
+/* Various builtins want plain char / int / long long vector types ... */
+typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
+typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+#if VEC_SIZE >=3D 8
+typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+#endif
+
+#if VEC_SIZE =3D=3D 8 && defined(__SSE__)
+# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) =3D=3D 0xff)
+#elif VEC_SIZE =3D=3D 16
+# if defined(__SSE__) && ELEM_SIZE =3D=3D 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) =3D=3D 0xf)
+# elif defined(__SSE2__)
+#  if ELEM_SIZE =3D=3D 8
+#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) =3D=3D 3)
+#  else
+#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) =3D=3D 0xffff)
+#  endif
+# endif
+#endif
+
+#ifndef to_bool
+static inline bool _to_bool(byte_vec_t bv)
+{
+    unsigned int i;
+
+    for ( i =3D 0; i < VEC_SIZE; ++i )
+        if ( bv[i] !=3D 0xff )
+            return false;
+
+    return true;
+}
+# define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
+#endif
+
+#if VEC_SIZE =3D=3D FLOAT_SIZE
+# define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE =3D=3D 16 && defined(__SSE2__)
+# if FLOAT_SIZE =3D=3D 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
+# elif FLOAT_SIZE =3D=3D 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
+# endif
+#endif
+
+#if VEC_SIZE =3D=3D FLOAT_SIZE
+# define scalar_1op(x, op) ({ \
+    typeof((x)[0]) __attribute__((vector_size(16))) r; \
+    asm ( op : [out] "=3D&x" (r) : [in] "m" (x) ); \
+    (vec_t){ r[0] }; \
+})
+#endif
+
+#if FLOAT_SIZE =3D=3D 4 && defined(__SSE__)
+# if VEC_SIZE =3D=3D 16
+#  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
+#  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
+#  define max(x, y) __builtin_ia32_maxps(x, y)
+#  define min(x, y) __builtin_ia32_minps(x, y)
+#  define recip(x) __builtin_ia32_rcpps(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps(x)
+#  define sqrt(x) __builtin_ia32_sqrtps(x)
+#  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+# elif VEC_SIZE =3D=3D 4
+#  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
+#  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
+#  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
+# endif
+#elif FLOAT_SIZE =3D=3D 8 && defined(__SSE2__)
+# if VEC_SIZE =3D=3D 16
+#  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
+#  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
+#  define max(x, y) __builtin_ia32_maxpd(x, y)
+#  define min(x, y) __builtin_ia32_minpd(x, y)
+#  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_=
ia32_cvtpd2ps(x)))
+#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builti=
n_ia32_cvtpd2ps(x)))
+#  define sqrt(x) __builtin_ia32_sqrtpd(x)
+#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+# elif VEC_SIZE =3D=3D 8
+#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], =
%[out]; cvtss2sd %[out], %[out]")
+#  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], =
%[out]; cvtss2sd %[out], %[out]")
+#  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
+# endif
+#endif
+#if VEC_SIZE =3D=3D 16 && defined(__SSE2__)
+# if INT_SIZE =3D=3D 1 || UINT_SIZE =3D=3D 1
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)x=
, (vqi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)x=
, (vqi_t)y))
+# elif INT_SIZE =3D=3D 2 || UINT_SIZE =3D=3D 2
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)x=
, (vhi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)x=
, (vhi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
+                   (vsi_t)__builtin_ia32_pshufhw( \
+                          __builtin_ia32_pshuflw((vhi_t)x, 0b00011011), =
0b00011011), 0b01001110))
+# elif INT_SIZE =3D=3D 4 || UINT_SIZE =3D=3D 4
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)x=
, (vsi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)x=
, (vsi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b00011011))
+# elif INT_SIZE =3D=3D 8 || UINT_SIZE =3D=3D 8
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)=
x, (vdi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)=
x, (vdi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b01001110))
+# endif
+# if UINT_SIZE =3D=3D 1
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)x, (vqi_t)y))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)x, (vqi_t)y))
+# elif INT_SIZE =3D=3D 2
+#  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
+#  define min(x, y) __builtin_ia32_pminsw128(x, y)
+#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
+# elif UINT_SIZE =3D=3D 2
+#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)x, =
(vhi_t)y))
+# elif UINT_SIZE =3D=3D 4
+#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)x, =
(vsi_t)y))
+# endif
+# define select(d, x, y, m) ({ \
+    void *d_ =3D (d); \
+    vqi_t m_ =3D (vqi_t)(m); \
+    __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
+    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
+})
+#endif
+#if VEC_SIZE =3D=3D FLOAT_SIZE
+# define max(x, y) ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D (y)[0]; =
x_ > y_ ? x_ : y_; })})
+# define min(x, y) ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D (y)[0]; =
x_ < y_ ? x_ : y_; })})
+#endif
+
+/*
+ * Suppress value propagation by the compiler, preventing unwanted
+ * optimization. This at once makes the compiler use memory operands
+ * more often, which for our purposes is the more interesting case.
+ */
+#define touch(var) asm volatile ( "" : "+m" (var) )
+
+int simd_test(void)
+{
+    unsigned int i, j;
+    vec_t x, y, z, src, inv, alt, sh;
+
+    for ( i =3D 0, j =3D ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
+    {
+        src[i] =3D i + 1;
+        inv[i] =3D ELEM_COUNT - i;
+#ifdef UINT_SIZE
+        alt[i] =3D -!(i & 1);
+#else
+        alt[i] =3D i & 1 ? -1 : 1;
+#endif
+        if ( !(i & (i + 1)) )
+            --j;
+        sh[i] =3D j;
+    }
+
+    touch(src);
+    x =3D src;
+    touch(x);
+    if ( !to_bool(x =3D=3D src) ) return __LINE__;
+
+    touch(src);
+    y =3D x + src;
+    touch(src);
+    touch(y);
+    if ( !to_bool(y =3D=3D 2 * src) ) return __LINE__;
+
+    touch(src);
+    z =3D y -=3D src;
+    touch(z);
+    if ( !to_bool(x =3D=3D z) ) return __LINE__;
+
+#if defined(UINT_SIZE)
+
+    touch(inv);
+    x |=3D inv;
+    touch(inv);
+    y &=3D inv;
+    touch(inv);
+    z ^=3D inv;
+    touch(inv);
+    touch(x);
+    if ( !to_bool((x & ~y) =3D=3D z) ) return __LINE__;
+
+#elif ELEM_SIZE > 1 || VEC_SIZE <=3D 8
+
+    touch(src);
+    x *=3D src;
+    y =3D inv * inv;
+    touch(src);
+    z =3D src + inv;
+    touch(inv);
+    z *=3D (src - inv);
+    if ( !to_bool(x - y =3D=3D z) ) return __LINE__;
+
+#endif
+
+#if defined(FLOAT_SIZE)
+
+    x =3D src * alt;
+    touch(alt);
+    y =3D src / alt;
+    if ( !to_bool(x =3D=3D y) ) return __LINE__;
+    touch(alt);
+    touch(src);
+    if ( !to_bool(x * -alt =3D=3D -src) ) return __LINE__;
+
+# if defined(recip) && defined(to_int)
+
+    touch(src);
+    x =3D recip(src);
+    touch(src);
+    touch(x);
+    if ( !to_bool(to_int(recip(x)) =3D=3D src) ) return __LINE__;
+
+#  ifdef rsqrt
+    x =3D src * src;
+    touch(x);
+    y =3D rsqrt(x);
+    touch(y);
+    if ( !to_bool(to_int(recip(y)) =3D=3D src) ) return __LINE__;
+    touch(src);
+    if ( !to_bool(to_int(y) =3D=3D to_int(recip(src))) ) return __LINE__;
+#  endif
+
+# endif
+
+# ifdef sqrt
+    x =3D src * src;
+    touch(x);
+    if ( !to_bool(sqrt(x) =3D=3D src) ) return __LINE__;
+# endif
+
+#else
+
+# if ELEM_SIZE > 1
+
+    touch(inv);
+    x =3D src * inv;
+    touch(inv);
+    y[ELEM_COUNT - 1] =3D y[0] =3D j =3D ELEM_COUNT;
+    for ( i =3D 1; i < ELEM_COUNT / 2; ++i )
+        y[ELEM_COUNT - i - 1] =3D y[i] =3D y[i - 1] + (j -=3D 2);
+    if ( !to_bool(x =3D=3D y) ) return __LINE__;
+
+# ifdef mul_hi
+    touch(alt);
+    x =3D mul_hi(src, alt);
+    touch(alt);
+#  ifdef INT_SIZE
+    if ( !to_bool(x =3D=3D (alt < 0)) ) return __LINE__;
+#  else
+    if ( !to_bool(x =3D=3D (src & alt) + alt) ) return __LINE__;
+#  endif
+# endif
+
+# ifdef mul_full
+    x =3D src ^ alt;
+    touch(inv);
+    y =3D mul_full(x, inv);
+    touch(inv);
+    for ( i =3D 0; i < ELEM_COUNT; i +=3D 2 )
+    {
+        unsigned long long res =3D x[i] * 1ULL * inv[i];
+
+        z[i] =3D res;
+        z[i + 1] =3D res >> (ELEM_SIZE << 3);
+    }
+    if ( !to_bool(y =3D=3D z) ) return __LINE__;
+# endif
+
+    z =3D src;
+#  ifdef INT_SIZE
+    z *=3D alt;
+#  endif
+    touch(z);
+    x =3D z << 3;
+    touch(z);
+    y =3D z << 2;
+    touch(z);
+    if ( !to_bool(x =3D=3D y + y) ) return __LINE__;
+
+    touch(x);
+    z =3D x >> 2;
+    touch(x);
+    if ( !to_bool(y =3D=3D z + z) ) return __LINE__;
+
+    z =3D src;
+#  ifdef INT_SIZE
+    z *=3D alt;
+#  endif
+    /*
+     * Note that despite the touch()-es here there doesn't appear to be a =
way
+     * to make the compiler use a memory operand for the shift instruction=
 (at
+     * least without resorting to built-ins).
+     */
+    j =3D 3;
+    touch(j);
+    x =3D z << j;
+    touch(j);
+    j =3D 2;
+    touch(j);
+    y =3D z << j;
+    touch(j);
+    if ( !to_bool(x =3D=3D y + y) ) return __LINE__;
+
+    z =3D x >> j;
+    touch(j);
+    if ( !to_bool(y =3D=3D z + z) ) return __LINE__;
+
+# endif
+
+# if ELEM_SIZE =3D=3D 2 || defined(__SSE4_1__)
+    /*
+     * While there are no instructions with varying shift counts per =
field,
+     * the code turns out to be a nice exercise for pextr/pinsr.
+     */
+    z =3D src;
+#  ifdef INT_SIZE
+    z *=3D alt;
+#  endif
+    /*
+     * Zap elements for which the shift count is not positive (and hence
+     * the decrement below would yield a negative count).
+     */
+    z &=3D (sh > 0);
+    touch(sh);
+    x =3D z << sh;
+    touch(sh);
+    --sh;
+    touch(sh);
+    y =3D z << sh;
+    touch(sh);
+    if ( !to_bool(x =3D=3D y + y) ) return __LINE__;
+
+# endif
+
+#endif
+
+#if defined(max) && defined(min)
+# ifdef UINT_SIZE
+    touch(inv);
+    x =3D min(src, inv);
+    touch(inv);
+    y =3D max(src, inv);
+    touch(inv);
+    if ( !to_bool(x + y =3D=3D src + inv) ) return __LINE__;
+# else
+    x =3D src * alt;
+    y =3D inv * alt;
+    touch(y);
+    z =3D max(x, y);
+    touch(y);
+    y =3D min(x, y);
+    touch(y);
+    if ( !to_bool((y + z) * alt =3D=3D src + inv) ) return __LINE__;
+# endif
+#endif
+
+#ifdef swap
+    touch(src);
+    if ( !to_bool(swap(src) =3D=3D inv) ) return __LINE__;
+#endif
+
+#if defined(interleave_lo) && defined(interleave_hi)
+    touch(src);
+    x =3D interleave_lo(inv, src);
+    touch(src);
+    y =3D interleave_hi(inv, src);
+    touch(src);
+# ifdef UINT_SIZE
+    z =3D ((x - y) ^ ~alt) - ~alt;
+# else
+    z =3D (x - y) * alt;
+# endif
+    if ( !to_bool(z =3D=3D ELEM_COUNT / 2) ) return __LINE__;
+#endif
+
+#ifdef select
+# ifdef UINT_SIZE
+    select(&z, src, inv, alt);
+# else
+    select(&z, src, inv, alt > 0);
+# endif
+    for ( i =3D 0; i < ELEM_COUNT; ++i )
+        y[i] =3D (i & 1 ? inv : src)[i];
+    if ( !to_bool(z =3D=3D y) ) return __LINE__;
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@
=20
 #include "x86_emulate.h"
 #include "blowfish.h"
+#include "simd.h"
=20
 #define verbose false /* Switch to true for far more logging. */
=20
@@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st
     return regs->eax =3D=3D 2 && regs->edx =3D=3D 1;
 }
=20
+static bool simd_check_sse(void)
+{
+    return cpu_has_sse;
+}
+
+static bool simd_check_sse2(void)
+{
+    return cpu_has_sse2;
+}
+
+static bool simd_check_avx(void)
+{
+    return cpu_has_avx;
+}
+#define simd_check_sse_avx   simd_check_avx
+#define simd_check_sse2_avx  simd_check_avx
+
+static void simd_set_regs(struct cpu_user_regs *regs)
+{
+    if ( cpu_has_mmx )
+        asm volatile ( "emms" );
+}
+
+static bool simd_check_regs(const struct cpu_user_regs *regs)
+{
+    if ( !regs->eax )
+        return true;
+    printf("[line %u] ", (unsigned int)regs->eax);
+    return false;
+}
+
 static const struct {
     const void *code;
     size_t size;
     unsigned int bitness;
     const char*name;
+    bool (*check_cpu)(void);
     void (*set_regs)(struct cpu_user_regs *);
     bool (*check_regs)(const struct cpu_user_regs *);
 } blobs[] =3D {
@@ -39,6 +72,49 @@ static const struct {
     BLOWFISH(32, blowfish, ),
     BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args),
 #undef BLOWFISH
+#define SIMD_(bits, desc, feat, form)                     \
+    { .code =3D simd_x86_##bits##_D##feat##_##form,         \
+      .size =3D sizeof(simd_x86_##bits##_D##feat##_##form), \
+      .bitness =3D bits, .name =3D #desc,                     \
+      .check_cpu =3D simd_check_##feat,                     \
+      .set_regs =3D simd_set_regs,                          \
+      .check_regs =3D simd_check_regs }
+#ifdef __x86_64__
+# define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
+                                SIMD_(32, desc, feat, form)
+#else
+# define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+#endif
+    SIMD(SSE scalar single,      sse,         f4),
+    SIMD(SSE packed single,      sse,       16f4),
+    SIMD(SSE2 scalar single,     sse2,        f4),
+    SIMD(SSE2 packed single,     sse2,      16f4),
+    SIMD(SSE2 scalar double,     sse2,        f8),
+    SIMD(SSE2 packed double,     sse2,      16f8),
+    SIMD(SSE2 packed s8,         sse2,      16i1),
+    SIMD(SSE2 packed u8,         sse2,      16u1),
+    SIMD(SSE2 packed s16,        sse2,      16i2),
+    SIMD(SSE2 packed u16,        sse2,      16u2),
+    SIMD(SSE2 packed s32,        sse2,      16i4),
+    SIMD(SSE2 packed u32,        sse2,      16u4),
+    SIMD(SSE2 packed s64,        sse2,      16i8),
+    SIMD(SSE2 packed u64,        sse2,      16u8),
+    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
+    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
+    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
+    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
+    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
+    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
+    SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
+    SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
+    SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
+    SIMD(SSE2/AVX packed u16,    sse2_avx,  16u2),
+    SIMD(SSE2/AVX packed s32,    sse2_avx,  16i4),
+    SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
+    SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
+    SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
+#undef SIMD_
+#undef SIMD
 };
=20
 /* EFLAGS bit definitions. */
@@ -2598,6 +2674,9 @@ int main(int argc, char **argv)
             continue;
         }
=20
+        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
+            continue;
+
         memcpy(res, blobs[j].code, blobs[j].size);
         ctxt.addr_size =3D ctxt.sp_size =3D blobs[j].bitness;
=20


--=__Part58619C3D.1__=
Content-Type: text/plain; name="x86emul-SSE-AVX-0f-test.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment; filename="x86emul-SSE-AVX-0f-test.patch"

x86emul: test coverage for SSE/SSE2 insns=0A=0A... and their AVX equivalent=
s. Note that a few instructions aren't=0Acovered (yet), but those all fall =
into common pattern groups, so I=0Awould hope that for now we can do with =
what is there.=0A=0AMMX insns aren't being covered at all, as they're not =
easy to deal=0Awith: The compiler refuses to emit such for other than uses =
of built-in=0Afunctions.=0A=0AThe current way of testing AVX insns is =
meant to be temporary only:=0AOnce we fully support that feature, the =
present tests should rather be=0Areplaced than full ones simply =
added.=0A=0ASigned-off-by: Jan Beulich <jbeulich@suse.com>=0A---=0Av2: =
New.=0A=0A--- a/tools/tests/x86_emulator/Makefile=0A+++ b/tools/tests/x86_e=
mulator/Makefile=0A@@ -11,11 +11,36 @@ all: $(TARGET)=0A run: $(TARGET)=0A =
	./$(TARGET)=0A =0A-TESTCASES :=3D blowfish=0A+TESTCASES :=3D =
blowfish simd=0A =0A blowfish-cflags :=3D ""=0A blowfish-cflags-x86_32 =
:=3D "-mno-accumulate-outgoing-args -Dstatic=3D"=0A =0A+sse-vecs :=3D =
16=0A+sse-ints :=3D=0A+sse-flts :=3D 4=0A+sse2-vecs :=3D $(sse-vecs)=0A+sse=
2-ints :=3D 1 2 4 8=0A+sse2-flts :=3D 4 8=0A+=0A+# When converting SSE to =
AVX, have the compiler avoid XMM0 to widen=0A+# coverage of the VEX.vvvv =
checks in the emulator.=0A+sse2avx :=3D -ffixed-xmm0 -Wa,-msse2avx=0A+=0A+s=
imd-cflags :=3D $(foreach flavor,sse sse2, \=0A+                 $(foreach =
vec,$($(flavor)-vecs), \=0A+                   $(foreach int,$($(flavor)-in=
ts), \=0A+                     "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 =
-DVEC_SIZE=3D$(vec) -DINT_SIZE=3D$(int)" \=0A+                     =
"-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=3D$(vec) -DUINT_SIZE=
=3D$(int)" \=0A+                     "-D$(flavor)_avx_$(vec)i$(int) =
-m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=3D$(vec) -DINT_SIZE=3D$(int)" \=0A+  =
                   "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) =
-O2 -DVEC_SIZE=3D$(vec) -DUINT_SIZE=3D$(int)") \=0A+                   =
$(foreach flt,$($(flavor)-flts), \=0A+                     "-D$(flavor)_$(v=
ec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=3D$(vec) -DFLOAT_SIZE=3D$(flt)" =
\=0A+                     "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) =
$(sse2avx) -O2 -DVEC_SIZE=3D$(vec) -DFLOAT_SIZE=3D$(flt)")) \=0A+          =
       $(foreach flt,$($(flavor)-flts), \=0A+                   "-D$(flavor=
)_f$(flt) -m$(flavor) -mfpmath=3Dsse -O2 -DFLOAT_SIZE=3D$(flt)" \=0A+      =
             "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=3Dsse $(sse2avx)=
 -O2 -DFLOAT_SIZE=3D$(flt)"))=0A+=0A $(addsuffix .h,$(TESTCASES)): %.h: =
%.c testcase.mk Makefile=0A 	rm -f $@.new $*.bin=0A 	$(foreach =
arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \=0A--- =
/dev/null=0A+++ b/tools/tests/x86_emulator/simd.c=0A@@ -0,0 +1,450 =
@@=0A+#include <stdbool.h>=0A+=0A+asm (=0A+    "\t.text\n"=0A+    =
"\t.globl _start\n"=0A+    "_start:\n"=0A+#if defined(__i386__) && =
VEC_SIZE =3D=3D 16=0A+    "\tpush %ebp\n"=0A+    "\tmov %esp,%ebp\n"=0A+   =
 "\tand $~0xf,%esp\n"=0A+    "\tcall simd_test\n"=0A+    "\tleave\n"=0A+   =
 "\tret"=0A+#else=0A+    "\tjmp simd_test"=0A+#endif=0A+    );=0A+=0A+typed=
ef=0A+#if defined(INT_SIZE)=0A+# define ELEM_SIZE INT_SIZE=0A+signed =
int=0A+# if INT_SIZE =3D=3D 1=0A+#  define MODE QI=0A+# elif INT_SIZE =
=3D=3D 2=0A+#  define MODE HI=0A+# elif INT_SIZE =3D=3D 4=0A+#  define =
MODE SI=0A+# elif INT_SIZE =3D=3D 8=0A+#  define MODE DI=0A+# endif=0A+#eli=
f defined(UINT_SIZE)=0A+# define ELEM_SIZE UINT_SIZE=0A+unsigned int=0A+# =
if UINT_SIZE =3D=3D 1=0A+#  define MODE QI=0A+# elif UINT_SIZE =3D=3D =
2=0A+#  define MODE HI=0A+# elif UINT_SIZE =3D=3D 4=0A+#  define MODE =
SI=0A+# elif UINT_SIZE =3D=3D 8=0A+#  define MODE DI=0A+# endif=0A+#elif =
defined(FLOAT_SIZE)=0A+float=0A+# define ELEM_SIZE FLOAT_SIZE=0A+# if =
FLOAT_SIZE =3D=3D 4=0A+#  define MODE SF=0A+# elif FLOAT_SIZE =3D=3D =
8=0A+#  define MODE DF=0A+# endif=0A+#endif=0A+#ifndef VEC_SIZE=0A+# =
define VEC_SIZE ELEM_SIZE=0A+#endif=0A+__attribute__((mode(MODE), =
vector_size(VEC_SIZE))) vec_t;=0A+=0A+#define ELEM_COUNT (VEC_SIZE / =
ELEM_SIZE)=0A+=0A+typedef unsigned int __attribute((mode(QI), vector_size(V=
EC_SIZE))) byte_vec_t;=0A+=0A+/* Various builtins want plain char / int / =
long long vector types ... */=0A+typedef char __attribute__((vector_size(VE=
C_SIZE))) vqi_t;=0A+typedef short __attribute__((vector_size(VEC_SIZE))) =
vhi_t;=0A+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;=0A+#if =
VEC_SIZE >=3D 8=0A+typedef long long __attribute__((vector_size(VEC_SIZE)))=
 vdi_t;=0A+#endif=0A+=0A+#if VEC_SIZE =3D=3D 8 && defined(__SSE__)=0A+# =
define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) =3D=3D 0xff)=0A+#elif =
VEC_SIZE =3D=3D 16=0A+# if defined(__SSE__) && ELEM_SIZE =3D=3D 4=0A+#  =
define to_bool(cmp) (__builtin_ia32_movmskps(cmp) =3D=3D 0xf)=0A+# elif =
defined(__SSE2__)=0A+#  if ELEM_SIZE =3D=3D 8=0A+#   define to_bool(cmp) =
(__builtin_ia32_movmskpd(cmp) =3D=3D 3)=0A+#  else=0A+#   define to_bool(cm=
p) (__builtin_ia32_pmovmskb128(cmp) =3D=3D 0xffff)=0A+#  endif=0A+# =
endif=0A+#endif=0A+=0A+#ifndef to_bool=0A+static inline bool _to_bool(byte_=
vec_t bv)=0A+{=0A+    unsigned int i;=0A+=0A+    for ( i =3D 0; i < =
VEC_SIZE; ++i )=0A+        if ( bv[i] !=3D 0xff )=0A+            return =
false;=0A+=0A+    return true;=0A+}=0A+# define to_bool(cmp) _to_bool((byte=
_vec_t)(cmp))=0A+#endif=0A+=0A+#if VEC_SIZE =3D=3D FLOAT_SIZE=0A+# define =
to_int(x) ((vec_t){ (int)(x)[0] })=0A+#elif VEC_SIZE =3D=3D 16 && =
defined(__SSE2__)=0A+# if FLOAT_SIZE =3D=3D 4=0A+#  define to_int(x) =
__builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))=0A+# elif FLOAT_SIZE =
=3D=3D 8=0A+#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtp=
d2dq(x))=0A+# endif=0A+#endif=0A+=0A+#if VEC_SIZE =3D=3D FLOAT_SIZE=0A+# =
define scalar_1op(x, op) ({ \=0A+    typeof((x)[0]) __attribute__((vector_s=
ize(16))) r; \=0A+    asm ( op : [out] "=3D&x" (r) : [in] "m" (x) ); \=0A+ =
   (vec_t){ r[0] }; \=0A+})=0A+#endif=0A+=0A+#if FLOAT_SIZE =3D=3D 4 && =
defined(__SSE__)=0A+# if VEC_SIZE =3D=3D 16=0A+#  define interleave_hi(x, =
y) __builtin_ia32_unpckhps(x, y)=0A+#  define interleave_lo(x, y) =
__builtin_ia32_unpcklps(x, y)=0A+#  define max(x, y) __builtin_ia32_maxps(x=
, y)=0A+#  define min(x, y) __builtin_ia32_minps(x, y)=0A+#  define =
recip(x) __builtin_ia32_rcpps(x)=0A+#  define rsqrt(x) __builtin_ia32_rsqrt=
ps(x)=0A+#  define sqrt(x) __builtin_ia32_sqrtps(x)=0A+#  define swap(x) =
__builtin_ia32_shufps(x, x, 0b00011011)=0A+# elif VEC_SIZE =3D=3D 4=0A+#  =
define recip(x) scalar_1op(x, "rcpss %[in], %[out]")=0A+#  define rsqrt(x) =
scalar_1op(x, "rsqrtss %[in], %[out]")=0A+#  define sqrt(x) scalar_1op(x, =
"sqrtss %[in], %[out]")=0A+# endif=0A+#elif FLOAT_SIZE =3D=3D 8 && =
defined(__SSE2__)=0A+# if VEC_SIZE =3D=3D 16=0A+#  define interleave_hi(x, =
y) __builtin_ia32_unpckhpd(x, y)=0A+#  define interleave_lo(x, y) =
__builtin_ia32_unpcklpd(x, y)=0A+#  define max(x, y) __builtin_ia32_maxpd(x=
, y)=0A+#  define min(x, y) __builtin_ia32_minpd(x, y)=0A+#  define =
recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2=
ps(x)))=0A+#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtp=
s(__builtin_ia32_cvtpd2ps(x)))=0A+#  define sqrt(x) __builtin_ia32_sqrtpd(x=
)=0A+#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)=0A+# elif =
VEC_SIZE =3D=3D 8=0A+#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], =
%[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")=0A+#  define =
rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; =
cvtss2sd %[out], %[out]")=0A+#  define sqrt(x) scalar_1op(x, "sqrtsd =
%[in], %[out]")=0A+# endif=0A+#endif=0A+#if VEC_SIZE =3D=3D 16 && =
defined(__SSE2__)=0A+# if INT_SIZE =3D=3D 1 || UINT_SIZE =3D=3D 1=0A+#  =
define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)x, =
(vqi_t)y))=0A+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklb=
w128((vqi_t)x, (vqi_t)y))=0A+# elif INT_SIZE =3D=3D 2 || UINT_SIZE =3D=3D =
2=0A+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi=
_t)x, (vhi_t)y))=0A+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_pu=
npcklwd128((vhi_t)x, (vhi_t)y))=0A+#  define swap(x) ((vec_t)__builtin_ia32=
_pshufd( \=0A+                   (vsi_t)__builtin_ia32_pshufhw( \=0A+      =
                    __builtin_ia32_pshuflw((vhi_t)x, 0b00011011), =
0b00011011), 0b01001110))=0A+# elif INT_SIZE =3D=3D 4 || UINT_SIZE =3D=3D =
4=0A+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi=
_t)x, (vsi_t)y))=0A+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_pu=
npckldq128((vsi_t)x, (vsi_t)y))=0A+#  define swap(x) ((vec_t)__builtin_ia32=
_pshufd((vsi_t)x, 0b00011011))=0A+# elif INT_SIZE =3D=3D 8 || UINT_SIZE =
=3D=3D 8=0A+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq=
128((vdi_t)x, (vdi_t)y))=0A+#  define interleave_lo(x, y) ((vec_t)__builtin=
_ia32_punpcklqdq128((vdi_t)x, (vdi_t)y))=0A+#  define swap(x) ((vec_t)__bui=
ltin_ia32_pshufd((vsi_t)x, 0b01001110))=0A+# endif=0A+# if UINT_SIZE =
=3D=3D 1=0A+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)x, =
(vqi_t)y))=0A+#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)x=
, (vqi_t)y))=0A+# elif INT_SIZE =3D=3D 2=0A+#  define max(x, y) __builtin_i=
a32_pmaxsw128(x, y)=0A+#  define min(x, y) __builtin_ia32_pminsw128(x, =
y)=0A+#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)=0A+# elif =
UINT_SIZE =3D=3D 2=0A+#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw=
128((vhi_t)x, (vhi_t)y))=0A+# elif UINT_SIZE =3D=3D 4=0A+#  define =
mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)x, (vsi_t)y))=0A+# =
endif=0A+# define select(d, x, y, m) ({ \=0A+    void *d_ =3D (d); \=0A+   =
 vqi_t m_ =3D (vqi_t)(m); \=0A+    __builtin_ia32_maskmovdqu((vqi_t)(x),  =
m_, d_); \=0A+    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); =
\=0A+})=0A+#endif=0A+#if VEC_SIZE =3D=3D FLOAT_SIZE=0A+# define max(x, y) =
((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D (y)[0]; x_ > y_ ? x_ : y_; =
})})=0A+# define min(x, y) ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D =
(y)[0]; x_ < y_ ? x_ : y_; })})=0A+#endif=0A+=0A+/*=0A+ * Suppress value =
propagation by the compiler, preventing unwanted=0A+ * optimization. This =
at once makes the compiler use memory operands=0A+ * more often, which for =
our purposes is the more interesting case.=0A+ */=0A+#define touch(var) =
asm volatile ( "" : "+m" (var) )=0A+=0A+int simd_test(void)=0A+{=0A+    =
unsigned int i, j;=0A+    vec_t x, y, z, src, inv, alt, sh;=0A+=0A+    for =
( i =3D 0, j =3D ELEM_SIZE << 3; i < ELEM_COUNT; ++i )=0A+    {=0A+        =
src[i] =3D i + 1;=0A+        inv[i] =3D ELEM_COUNT - i;=0A+#ifdef =
UINT_SIZE=0A+        alt[i] =3D -!(i & 1);=0A+#else=0A+        alt[i] =3D =
i & 1 ? -1 : 1;=0A+#endif=0A+        if ( !(i & (i + 1)) )=0A+            =
--j;=0A+        sh[i] =3D j;=0A+    }=0A+=0A+    touch(src);=0A+    x =3D =
src;=0A+    touch(x);=0A+    if ( !to_bool(x =3D=3D src) ) return =
__LINE__;=0A+=0A+    touch(src);=0A+    y =3D x + src;=0A+    touch(src);=
=0A+    touch(y);=0A+    if ( !to_bool(y =3D=3D 2 * src) ) return =
__LINE__;=0A+=0A+    touch(src);=0A+    z =3D y -=3D src;=0A+    =
touch(z);=0A+    if ( !to_bool(x =3D=3D z) ) return __LINE__;=0A+=0A+#if =
defined(UINT_SIZE)=0A+=0A+    touch(inv);=0A+    x |=3D inv;=0A+    =
touch(inv);=0A+    y &=3D inv;=0A+    touch(inv);=0A+    z ^=3D inv;=0A+   =
 touch(inv);=0A+    touch(x);=0A+    if ( !to_bool((x & ~y) =3D=3D z) ) =
return __LINE__;=0A+=0A+#elif ELEM_SIZE > 1 || VEC_SIZE <=3D 8=0A+=0A+    =
touch(src);=0A+    x *=3D src;=0A+    y =3D inv * inv;=0A+    touch(src);=
=0A+    z =3D src + inv;=0A+    touch(inv);=0A+    z *=3D (src - inv);=0A+ =
   if ( !to_bool(x - y =3D=3D z) ) return __LINE__;=0A+=0A+#endif=0A+=0A+#i=
f defined(FLOAT_SIZE)=0A+=0A+    x =3D src * alt;=0A+    touch(alt);=0A+   =
 y =3D src / alt;=0A+    if ( !to_bool(x =3D=3D y) ) return __LINE__;=0A+  =
  touch(alt);=0A+    touch(src);=0A+    if ( !to_bool(x * -alt =3D=3D =
-src) ) return __LINE__;=0A+=0A+# if defined(recip) && defined(to_int)=0A+=
=0A+    touch(src);=0A+    x =3D recip(src);=0A+    touch(src);=0A+    =
touch(x);=0A+    if ( !to_bool(to_int(recip(x)) =3D=3D src) ) return =
__LINE__;=0A+=0A+#  ifdef rsqrt=0A+    x =3D src * src;=0A+    touch(x);=0A=
+    y =3D rsqrt(x);=0A+    touch(y);=0A+    if ( !to_bool(to_int(recip(y))=
 =3D=3D src) ) return __LINE__;=0A+    touch(src);=0A+    if ( !to_bool(to_=
int(y) =3D=3D to_int(recip(src))) ) return __LINE__;=0A+#  endif=0A+=0A+# =
endif=0A+=0A+# ifdef sqrt=0A+    x =3D src * src;=0A+    touch(x);=0A+    =
if ( !to_bool(sqrt(x) =3D=3D src) ) return __LINE__;=0A+# endif=0A+=0A+#els=
e=0A+=0A+# if ELEM_SIZE > 1=0A+=0A+    touch(inv);=0A+    x =3D src * =
inv;=0A+    touch(inv);=0A+    y[ELEM_COUNT - 1] =3D y[0] =3D j =3D =
ELEM_COUNT;=0A+    for ( i =3D 1; i < ELEM_COUNT / 2; ++i )=0A+        =
y[ELEM_COUNT - i - 1] =3D y[i] =3D y[i - 1] + (j -=3D 2);=0A+    if ( =
!to_bool(x =3D=3D y) ) return __LINE__;=0A+=0A+# ifdef mul_hi=0A+    =
touch(alt);=0A+    x =3D mul_hi(src, alt);=0A+    touch(alt);=0A+#  ifdef =
INT_SIZE=0A+    if ( !to_bool(x =3D=3D (alt < 0)) ) return __LINE__;=0A+#  =
else=0A+    if ( !to_bool(x =3D=3D (src & alt) + alt) ) return __LINE__;=0A=
+#  endif=0A+# endif=0A+=0A+# ifdef mul_full=0A+    x =3D src ^ alt;=0A+   =
 touch(inv);=0A+    y =3D mul_full(x, inv);=0A+    touch(inv);=0A+    for =
( i =3D 0; i < ELEM_COUNT; i +=3D 2 )=0A+    {=0A+        unsigned long =
long res =3D x[i] * 1ULL * inv[i];=0A+=0A+        z[i] =3D res;=0A+        =
z[i + 1] =3D res >> (ELEM_SIZE << 3);=0A+    }=0A+    if ( !to_bool(y =
=3D=3D z) ) return __LINE__;=0A+# endif=0A+=0A+    z =3D src;=0A+#  ifdef =
INT_SIZE=0A+    z *=3D alt;=0A+#  endif=0A+    touch(z);=0A+    x =3D z << =
3;=0A+    touch(z);=0A+    y =3D z << 2;=0A+    touch(z);=0A+    if ( =
!to_bool(x =3D=3D y + y) ) return __LINE__;=0A+=0A+    touch(x);=0A+    z =
=3D x >> 2;=0A+    touch(x);=0A+    if ( !to_bool(y =3D=3D z + z) ) return =
__LINE__;=0A+=0A+    z =3D src;=0A+#  ifdef INT_SIZE=0A+    z *=3D =
alt;=0A+#  endif=0A+    /*=0A+     * Note that despite the touch()-es here =
there doesn't appear to be a way=0A+     * to make the compiler use a =
memory operand for the shift instruction (at=0A+     * least without =
resorting to built-ins).=0A+     */=0A+    j =3D 3;=0A+    touch(j);=0A+   =
 x =3D z << j;=0A+    touch(j);=0A+    j =3D 2;=0A+    touch(j);=0A+    y =
=3D z << j;=0A+    touch(j);=0A+    if ( !to_bool(x =3D=3D y + y) ) return =
__LINE__;=0A+=0A+    z =3D x >> j;=0A+    touch(j);=0A+    if ( !to_bool(y =
=3D=3D z + z) ) return __LINE__;=0A+=0A+# endif=0A+=0A+# if ELEM_SIZE =
=3D=3D 2 || defined(__SSE4_1__)=0A+    /*=0A+     * While there are no =
instructions with varying shift counts per field,=0A+     * the code turns =
out to be a nice exercise for pextr/pinsr.=0A+     */=0A+    z =3D =
src;=0A+#  ifdef INT_SIZE=0A+    z *=3D alt;=0A+#  endif=0A+    /*=0A+     =
* Zap elements for which the shift count is negative (and hence the=0A+   =
  * decrement below would yield a negative count).=0A+     */=0A+ =
   z &=3D (sh > 0);=0A+    touch(sh);=0A+    x =3D z << sh;=0A+    =
touch(sh);=0A+    --sh;=0A+    touch(sh);=0A+    y =3D z << sh;=0A+    =
touch(sh);=0A+    if ( !to_bool(x =3D=3D y + y) ) return __LINE__;=0A+=0A+#=
 endif=0A+=0A+#endif=0A+=0A+#if defined(max) && defined(min)=0A+# ifdef =
UINT_SIZE=0A+    touch(inv);=0A+    x =3D min(src, inv);=0A+    touch(inv);=
=0A+    y =3D max(src, inv);=0A+    touch(inv);=0A+    if ( !to_bool(x + y =
=3D=3D src + inv) ) return __LINE__;=0A+# else=0A+    x =3D src * alt;=0A+ =
   y =3D inv * alt;=0A+    touch(y);=0A+    z =3D max(x, y);=0A+    =
touch(y);=0A+    y =3D min(x, y);=0A+    touch(y);=0A+    if ( !to_bool((y =
+ z) * alt =3D=3D src + inv) ) return __LINE__;=0A+# endif=0A+#endif=0A+=0A=
+#ifdef swap=0A+    touch(src);=0A+    if ( !to_bool(swap(src) =3D=3D inv) =
) return __LINE__;=0A+#endif=0A+=0A+#if defined(interleave_lo) && =
defined(interleave_hi)=0A+    touch(src);=0A+    x =3D interleave_lo(inv, =
src);=0A+    touch(src);=0A+    y =3D interleave_hi(inv, src);=0A+    =
touch(src);=0A+# ifdef UINT_SIZE=0A+    z =3D ((x - y) ^ ~alt) - ~alt;=0A+#=
 else=0A+    z =3D (x - y) * alt;=0A+# endif=0A+    if ( !to_bool(z =3D=3D =
ELEM_COUNT / 2) ) return __LINE__;=0A+#endif=0A+=0A+#ifdef select=0A+# =
ifdef UINT_SIZE=0A+    select(&z, src, inv, alt);=0A+# else=0A+    =
select(&z, src, inv, alt > 0);=0A+# endif=0A+    for ( i =3D 0; i < =
ELEM_COUNT; ++i )=0A+        y[i] =3D (i & 1 ? inv : src)[i];=0A+    if ( =
!to_bool(z =3D=3D y) ) return __LINE__;=0A+#endif=0A+=0A+    return =
0;=0A+}=0A--- a/tools/tests/x86_emulator/test_x86_emulator.c=0A+++ =
b/tools/tests/x86_emulator/test_x86_emulator.c=0A@@ -5,6 +5,7 @@=0A =0A =
#include "x86_emulate.h"=0A #include "blowfish.h"=0A+#include "simd.h"=0A =
=0A #define verbose false /* Switch to true for far more logging. */=0A =
=0A@@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st=0A     =
return regs->eax =3D=3D 2 && regs->edx =3D=3D 1;=0A }=0A =0A+static bool =
simd_check_sse(void)=0A+{=0A+    return cpu_has_sse;=0A+}=0A+=0A+static =
bool simd_check_sse2(void)=0A+{=0A+    return cpu_has_sse2;=0A+}=0A+=0A+sta=
tic bool simd_check_avx(void)=0A+{=0A+    return cpu_has_avx;=0A+}=0A+#defi=
ne simd_check_sse_avx   simd_check_avx=0A+#define simd_check_sse2_avx  =
simd_check_avx=0A+=0A+static void simd_set_regs(struct cpu_user_regs =
*regs)=0A+{=0A+    if ( cpu_has_mmx )=0A+        asm volatile ( "emms" =
);=0A+}=0A+=0A+static bool simd_check_regs(const struct cpu_user_regs =
*regs)=0A+{=0A+    if ( !regs->eax )=0A+        return true;=0A+    =
printf("[line %u] ", (unsigned int)regs->eax);=0A+    return false;=0A+}=0A=
+=0A static const struct {=0A     const void *code;=0A     size_t size;=0A =
    unsigned int bitness;=0A     const char*name;=0A+    bool (*check_cpu)(=
void);=0A     void (*set_regs)(struct cpu_user_regs *);=0A     bool =
(*check_regs)(const struct cpu_user_regs *);=0A } blobs[] =3D {=0A@@ -39,6 =
+72,49 @@ static const struct {=0A     BLOWFISH(32, blowfish, ),=0A     =
BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args),=0A #undef =
BLOWFISH=0A+#define SIMD_(bits, desc, feat, form)                     =
\=0A+    { .code =3D simd_x86_##bits##_D##feat##_##form,         \=0A+     =
 .size =3D sizeof(simd_x86_##bits##_D##feat##_##form), \=0A+      .bitness =
=3D bits, .name =3D #desc,                     \=0A+      .check_cpu =3D =
simd_check_##feat,                     \=0A+      .set_regs =3D simd_set_re=
gs,                          \=0A+      .check_regs =3D simd_check_regs =
}=0A+#ifdef __x86_64__=0A+# define SIMD(desc, feat, form) SIMD_(64, desc, =
feat, form), \=0A+                                SIMD_(32, desc, feat, =
form)=0A+#else=0A+# define SIMD(desc, feat, form) SIMD_(32, desc, feat, =
form)=0A+#endif=0A+    SIMD(SSE scalar single,      sse,         f4),=0A+  =
  SIMD(SSE packed single,      sse,       16f4),=0A+    SIMD(SSE2 scalar =
single,     sse2,        f4),=0A+    SIMD(SSE2 packed single,     sse2,    =
  16f4),=0A+    SIMD(SSE2 scalar double,     sse2,        f8),=0A+    =
SIMD(SSE2 packed double,     sse2,      16f8),=0A+    SIMD(SSE2 packed s8, =
        sse2,      16i1),=0A+    SIMD(SSE2 packed u8,         sse2,      =
16u1),=0A+    SIMD(SSE2 packed s16,        sse2,      16i2),=0A+    =
SIMD(SSE2 packed u16,        sse2,      16u2),=0A+    SIMD(SSE2 packed =
s32,        sse2,      16i4),=0A+    SIMD(SSE2 packed u32,        sse2,    =
  16u4),=0A+    SIMD(SSE2 packed s64,        sse2,      16i8),=0A+    =
SIMD(SSE2 packed u64,        sse2,      16u8),=0A+    SIMD(SSE/AVX scalar =
single,  sse_avx,     f4),=0A+    SIMD(SSE/AVX packed single,  sse_avx,   =
16f4),=0A+    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),=0A+    =
SIMD(SSE2/AVX packed single, sse2_avx,  16f4),=0A+    SIMD(SSE2/AVX scalar =
double, sse2_avx,    f8),=0A+    SIMD(SSE2/AVX packed double, sse2_avx,  =
16f8),=0A+    SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),=0A+    =
SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),=0A+    SIMD(SSE2/AVX packed =
s16,    sse2_avx,  16i2),=0A+    SIMD(SSE2/AVX packed u16,    sse2_avx,  =
16u2),=0A+    SIMD(SSE2/AVX packed s32,    sse2_avx,  16i4),=0A+    =
SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),=0A+    SIMD(SSE2/AVX packed =
s64,    sse2_avx,  16i8),=0A+    SIMD(SSE2/AVX packed u64,    sse2_avx,  =
16u8),=0A+#undef SIMD_=0A+#undef SIMD=0A };=0A =0A /* EFLAGS bit definition=
s. */=0A@@ -2598,6 +2674,9 @@ int main(int argc, char **argv)=0A           =
  continue;=0A         }=0A =0A+        if ( blobs[j].check_cpu && =
!blobs[j].check_cpu() )=0A+            continue;=0A+=0A         memcpy(res,=
 blobs[j].code, blobs[j].size);=0A         ctxt.addr_size =3D ctxt.sp_size =
=3D blobs[j].bitness;=0A =0A
--=__Part58619C3D.1__=
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: base64
Content-Disposition: inline

X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs
IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v
cmcveGVuLWRldmVsCg==

--=__Part58619C3D.1__=--