All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Jan Beulich" <JBeulich@suse.com>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH v2 11/11] x86emul: test coverage for SSE/SSE2 insns
Date: Wed, 01 Feb 2017 04:19:09 -0700	[thread overview]
Message-ID: <5891D23D0200007800135C17@prv-mh.provo.novell.com> (raw)
In-Reply-To: <5891CF990200007800135BC5@prv-mh.provo.novell.com>

[-- Attachment #1: Type: text/plain, Size: 19229 bytes --]

... and their AVX equivalents. Note that a few instructions aren't
covered (yet), but those all fall into common pattern groups, so I
would hope that for now we can do with what is there.

MMX insns aren't being covered at all, as they're not easy to deal
with: The compiler refuses to emit such for other than uses of built-in
functions.

The current way of testing AVX insns is meant to be temporary only:
Once we fully support that feature, the present tests should rather be
replaced than full ones simply added.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,11 +11,36 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-TESTCASES := blowfish
+TESTCASES := blowfish simd
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
+sse-vecs := 16
+sse-ints :=
+sse-flts := 4
+sse2-vecs := $(sse-vecs)
+sse2-ints := 1 2 4 8
+sse2-flts := 4 8
+
+# When converting SSE to AVX, have the compiler avoid XMM0 to widen
+# coverage of the VEX.vvvv checks in the emulator.
+sse2avx := -ffixed-xmm0 -Wa,-msse2avx
+
+simd-cflags := $(foreach flavor,sse sse2, \
+                 $(foreach vec,$($(flavor)-vecs), \
+                   $(foreach int,$($(flavor)-ints), \
+                     "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+                     "-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)" \
+                     "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+                     "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+                   $(foreach flt,$($(flavor)-flts), \
+                     "-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)" \
+                     "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+                 $(foreach flt,$($(flavor)-flts), \
+                   "-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)" \
+                   "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx) -O2 -DFLOAT_SIZE=$(flt)"))
+
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
 	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
--- /dev/null
+++ b/tools/tests/x86_emulator/simd.c
@@ -0,0 +1,450 @@
+#include <stdbool.h>
+
+asm (
+    "\t.text\n"
+    "\t.globl _start\n"
+    "_start:\n"
+#if defined(__i386__) && VEC_SIZE == 16
+    "\tpush %ebp\n"
+    "\tmov %esp,%ebp\n"
+    "\tand $~0xf,%esp\n"
+    "\tcall simd_test\n"
+    "\tleave\n"
+    "\tret"
+#else
+    "\tjmp simd_test"
+#endif
+    );
+
+typedef
+#if defined(INT_SIZE)
+# define ELEM_SIZE INT_SIZE
+signed int
+# if INT_SIZE == 1
+#  define MODE QI
+# elif INT_SIZE == 2
+#  define MODE HI
+# elif INT_SIZE == 4
+#  define MODE SI
+# elif INT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(UINT_SIZE)
+# define ELEM_SIZE UINT_SIZE
+unsigned int
+# if UINT_SIZE == 1
+#  define MODE QI
+# elif UINT_SIZE == 2
+#  define MODE HI
+# elif UINT_SIZE == 4
+#  define MODE SI
+# elif UINT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(FLOAT_SIZE)
+float
+# define ELEM_SIZE FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define MODE SF
+# elif FLOAT_SIZE == 8
+#  define MODE DF
+# endif
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE ELEM_SIZE
+#endif
+__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
+
+#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
+
+typedef unsigned int __attribute((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
+
+/* Various builtins want plain char / int / long long vector types ... */
+typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
+typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+#if VEC_SIZE >= 8
+typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+#endif
+
+#if VEC_SIZE == 8 && defined(__SSE__)
+# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
+#elif VEC_SIZE == 16
+# if defined(__SSE__) && ELEM_SIZE == 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
+# elif defined(__SSE2__)
+#  if ELEM_SIZE == 8
+#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
+#  else
+#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+#  endif
+# endif
+#endif
+
+#ifndef to_bool
+static inline bool _to_bool(byte_vec_t bv)
+{
+    unsigned int i;
+
+    for ( i = 0; i < VEC_SIZE; ++i )
+        if ( bv[i] != 0xff )
+            return false;
+
+    return true;
+}
+# define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
+#endif
+
+#if VEC_SIZE == FLOAT_SIZE
+# define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE == 16 && defined(__SSE2__)
+# if FLOAT_SIZE == 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
+# elif FLOAT_SIZE == 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
+# endif
+#endif
+
+#if VEC_SIZE == FLOAT_SIZE
+# define scalar_1op(x, op) ({ \
+    typeof((x)[0]) __attribute__((vector_size(16))) r; \
+    asm ( op : [out] "=&x" (r) : [in] "m" (x) ); \
+    (vec_t){ r[0] }; \
+})
+#endif
+
+#if FLOAT_SIZE == 4 && defined(__SSE__)
+# if VEC_SIZE == 16
+#  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
+#  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
+#  define max(x, y) __builtin_ia32_maxps(x, y)
+#  define min(x, y) __builtin_ia32_minps(x, y)
+#  define recip(x) __builtin_ia32_rcpps(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps(x)
+#  define sqrt(x) __builtin_ia32_sqrtps(x)
+#  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+# elif VEC_SIZE == 4
+#  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
+#  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
+#  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
+# endif
+#elif FLOAT_SIZE == 8 && defined(__SSE2__)
+# if VEC_SIZE == 16
+#  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
+#  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
+#  define max(x, y) __builtin_ia32_maxpd(x, y)
+#  define min(x, y) __builtin_ia32_minpd(x, y)
+#  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
+#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
+#  define sqrt(x) __builtin_ia32_sqrtpd(x)
+#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+# elif VEC_SIZE == 8
+#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
+#  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
+#  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSE2__)
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)x, (vqi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)x, (vqi_t)y))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)x, (vhi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)x, (vhi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
+                   (vsi_t)__builtin_ia32_pshufhw( \
+                          __builtin_ia32_pshuflw((vhi_t)x, 0b00011011), 0b00011011), 0b01001110))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)x, (vsi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)x, (vsi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b00011011))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)x, (vdi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)x, (vdi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b01001110))
+# endif
+# if UINT_SIZE == 1
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)x, (vqi_t)y))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)x, (vqi_t)y))
+# elif INT_SIZE == 2
+#  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
+#  define min(x, y) __builtin_ia32_pminsw128(x, y)
+#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
+# elif UINT_SIZE == 2
+#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)x, (vhi_t)y))
+# elif UINT_SIZE == 4
+#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)x, (vsi_t)y))
+# endif
+# define select(d, x, y, m) ({ \
+    void *d_ = (d); \
+    vqi_t m_ = (vqi_t)(m); \
+    __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
+    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
+})
+#endif
+#if VEC_SIZE == FLOAT_SIZE
+# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
+# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
+#endif
+
+/*
+ * Suppress value propagation by the compiler, preventing unwanted
+ * optimization. This at once makes the compiler use memory operands
+ * more often, which for our purposes is the more interesting case.
+ */
+#define touch(var) asm volatile ( "" : "+m" (var) )
+
+int simd_test(void)
+{
+    unsigned int i, j;
+    vec_t x, y, z, src, inv, alt, sh;
+
+    for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i + 1;
+        inv[i] = ELEM_COUNT - i;
+#ifdef UINT_SIZE
+        alt[i] = -!(i & 1);
+#else
+        alt[i] = i & 1 ? -1 : 1;
+#endif
+        if ( !(i & (i + 1)) )
+            --j;
+        sh[i] = j;
+    }
+
+    touch(src);
+    x = src;
+    touch(x);
+    if ( !to_bool(x == src) ) return __LINE__;
+
+    touch(src);
+    y = x + src;
+    touch(src);
+    touch(y);
+    if ( !to_bool(y == 2 * src) ) return __LINE__;
+
+    touch(src);
+    z = y -= src;
+    touch(z);
+    if ( !to_bool(x == z) ) return __LINE__;
+
+#if defined(UINT_SIZE)
+
+    touch(inv);
+    x |= inv;
+    touch(inv);
+    y &= inv;
+    touch(inv);
+    z ^= inv;
+    touch(inv);
+    touch(x);
+    if ( !to_bool((x & ~y) == z) ) return __LINE__;
+
+#elif ELEM_SIZE > 1 || VEC_SIZE <= 8
+
+    touch(src);
+    x *= src;
+    y = inv * inv;
+    touch(src);
+    z = src + inv;
+    touch(inv);
+    z *= (src - inv);
+    if ( !to_bool(x - y == z) ) return __LINE__;
+
+#endif
+
+#if defined(FLOAT_SIZE)
+
+    x = src * alt;
+    touch(alt);
+    y = src / alt;
+    if ( !to_bool(x == y) ) return __LINE__;
+    touch(alt);
+    touch(src);
+    if ( !to_bool(x * -alt == -src) ) return __LINE__;
+
+# if defined(recip) && defined(to_int)
+
+    touch(src);
+    x = recip(src);
+    touch(src);
+    touch(x);
+    if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;
+
+#  ifdef rsqrt
+    x = src * src;
+    touch(x);
+    y = rsqrt(x);
+    touch(y);
+    if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
+    touch(src);
+    if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
+#  endif
+
+# endif
+
+# ifdef sqrt
+    x = src * src;
+    touch(x);
+    if ( !to_bool(sqrt(x) == src) ) return __LINE__;
+# endif
+
+#else
+
+# if ELEM_SIZE > 1
+
+    touch(inv);
+    x = src * inv;
+    touch(inv);
+    y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
+    for ( i = 1; i < ELEM_COUNT / 2; ++i )
+        y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
+    if ( !to_bool(x == y) ) return __LINE__;
+
+# ifdef mul_hi
+    touch(alt);
+    x = mul_hi(src, alt);
+    touch(alt);
+#  ifdef INT_SIZE
+    if ( !to_bool(x == (alt < 0)) ) return __LINE__;
+#  else
+    if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
+#  endif
+# endif
+
+# ifdef mul_full
+    x = src ^ alt;
+    touch(inv);
+    y = mul_full(x, inv);
+    touch(inv);
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        unsigned long long res = x[i] * 1ULL * inv[i];
+
+        z[i] = res;
+        z[i + 1] = res >> (ELEM_SIZE << 3);
+    }
+    if ( !to_bool(y == z) ) return __LINE__;
+# endif
+
+    z = src;
+#  ifdef INT_SIZE
+    z *= alt;
+#  endif
+    touch(z);
+    x = z << 3;
+    touch(z);
+    y = z << 2;
+    touch(z);
+    if ( !to_bool(x == y + y) ) return __LINE__;
+
+    touch(x);
+    z = x >> 2;
+    touch(x);
+    if ( !to_bool(y == z + z) ) return __LINE__;
+
+    z = src;
+#  ifdef INT_SIZE
+    z *= alt;
+#  endif
+    /*
+     * Note that despite the touch()-es here there doesn't appear to be a way
+     * to make the compiler use a memory operand for the shift instruction (at
+     * least without resorting to built-ins).
+     */
+    j = 3;
+    touch(j);
+    x = z << j;
+    touch(j);
+    j = 2;
+    touch(j);
+    y = z << j;
+    touch(j);
+    if ( !to_bool(x == y + y) ) return __LINE__;
+
+    z = x >> j;
+    touch(j);
+    if ( !to_bool(y == z + z) ) return __LINE__;
+
+# endif
+
+# if ELEM_SIZE == 2 || defined(__SSE4_1__)
+    /*
+     * While there are no instructions with varying shift counts per field,
+     * the code turns out to be a nice exercise for pextr/pinsr.
+     */
+    z = src;
+#  ifdef INT_SIZE
+    z *= alt;
+#  endif
+    /*
+     * Zap elements for which the shift count is negative (and hence the
+     * decrement below would yield a negative count).
+     */
+    z &= (sh > 0);
+    touch(sh);
+    x = z << sh;
+    touch(sh);
+    --sh;
+    touch(sh);
+    y = z << sh;
+    touch(sh);
+    if ( !to_bool(x == y + y) ) return __LINE__;
+
+# endif
+
+#endif
+
+#if defined(max) && defined(min)
+# ifdef UINT_SIZE
+    touch(inv);
+    x = min(src, inv);
+    touch(inv);
+    y = max(src, inv);
+    touch(inv);
+    if ( !to_bool(x + y == src + inv) ) return __LINE__;
+# else
+    x = src * alt;
+    y = inv * alt;
+    touch(y);
+    z = max(x, y);
+    touch(y);
+    y = min(x, y);
+    touch(y);
+    if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
+# endif
+#endif
+
+#ifdef swap
+    touch(src);
+    if ( !to_bool(swap(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(interleave_lo) && defined(interleave_hi)
+    touch(src);
+    x = interleave_lo(inv, src);
+    touch(src);
+    y = interleave_hi(inv, src);
+    touch(src);
+# ifdef UINT_SIZE
+    z = ((x - y) ^ ~alt) - ~alt;
+# else
+    z = (x - y) * alt;
+# endif
+    if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
+#endif
+
+#ifdef select
+# ifdef UINT_SIZE
+    select(&z, src, inv, alt);
+# else
+    select(&z, src, inv, alt > 0);
+# endif
+    for ( i = 0; i < ELEM_COUNT; ++i )
+        y[i] = (i & 1 ? inv : src)[i];
+    if ( !to_bool(z == y) ) return __LINE__;
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@
 
 #include "x86_emulate.h"
 #include "blowfish.h"
+#include "simd.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st
     return regs->eax == 2 && regs->edx == 1;
 }
 
+static bool simd_check_sse(void)
+{
+    return cpu_has_sse;
+}
+
+static bool simd_check_sse2(void)
+{
+    return cpu_has_sse2;
+}
+
+static bool simd_check_avx(void)
+{
+    return cpu_has_avx;
+}
+#define simd_check_sse_avx   simd_check_avx
+#define simd_check_sse2_avx  simd_check_avx
+
+static void simd_set_regs(struct cpu_user_regs *regs)
+{
+    if ( cpu_has_mmx )
+        asm volatile ( "emms" );
+}
+
+static bool simd_check_regs(const struct cpu_user_regs *regs)
+{
+    if ( !regs->eax )
+        return true;
+    printf("[line %u] ", (unsigned int)regs->eax);
+    return false;
+}
+
 static const struct {
     const void *code;
     size_t size;
     unsigned int bitness;
     const char*name;
+    bool (*check_cpu)(void);
     void (*set_regs)(struct cpu_user_regs *);
     bool (*check_regs)(const struct cpu_user_regs *);
 } blobs[] = {
@@ -39,6 +72,49 @@ static const struct {
     BLOWFISH(32, blowfish, ),
     BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args),
 #undef BLOWFISH
+#define SIMD_(bits, desc, feat, form)                     \
+    { .code = simd_x86_##bits##_D##feat##_##form,         \
+      .size = sizeof(simd_x86_##bits##_D##feat##_##form), \
+      .bitness = bits, .name = #desc,                     \
+      .check_cpu = simd_check_##feat,                     \
+      .set_regs = simd_set_regs,                          \
+      .check_regs = simd_check_regs }
+#ifdef __x86_64__
+# define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
+                                SIMD_(32, desc, feat, form)
+#else
+# define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+#endif
+    SIMD(SSE scalar single,      sse,         f4),
+    SIMD(SSE packed single,      sse,       16f4),
+    SIMD(SSE2 scalar single,     sse2,        f4),
+    SIMD(SSE2 packed single,     sse2,      16f4),
+    SIMD(SSE2 scalar double,     sse2,        f8),
+    SIMD(SSE2 packed double,     sse2,      16f8),
+    SIMD(SSE2 packed s8,         sse2,      16i1),
+    SIMD(SSE2 packed u8,         sse2,      16u1),
+    SIMD(SSE2 packed s16,        sse2,      16i2),
+    SIMD(SSE2 packed u16,        sse2,      16u2),
+    SIMD(SSE2 packed s32,        sse2,      16i4),
+    SIMD(SSE2 packed u32,        sse2,      16u4),
+    SIMD(SSE2 packed s64,        sse2,      16i8),
+    SIMD(SSE2 packed u64,        sse2,      16u8),
+    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
+    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
+    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
+    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
+    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
+    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
+    SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
+    SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
+    SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
+    SIMD(SSE2/AVX packed u16,    sse2_avx,  16u2),
+    SIMD(SSE2/AVX packed s32,    sse2_avx,  16i4),
+    SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
+    SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
+    SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
+#undef SIMD_
+#undef SIMD
 };
 
 /* EFLAGS bit definitions. */
@@ -2598,6 +2674,9 @@ int main(int argc, char **argv)
             continue;
         }
 
+        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
+            continue;
+
         memcpy(res, blobs[j].code, blobs[j].size);
         ctxt.addr_size = ctxt.sp_size = blobs[j].bitness;
 



[-- Attachment #2: x86emul-SSE-AVX-0f-test.patch --]
[-- Type: text/plain, Size: 19270 bytes --]

x86emul: test coverage for SSE/SSE2 insns

... and their AVX equivalents. Note that a few instructions aren't
covered (yet), but those all fall into common pattern groups, so I
would hope that for now we can do with what is there.

MMX insns aren't being covered at all, as they're not easy to deal
with: The compiler refuses to emit such for other than uses of built-in
functions.

The current way of testing AVX insns is meant to be temporary only:
Once we fully support that feature, the present tests should rather be
replaced than full ones simply added.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,11 +11,36 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-TESTCASES := blowfish
+TESTCASES := blowfish simd
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
+sse-vecs := 16
+sse-ints :=
+sse-flts := 4
+sse2-vecs := $(sse-vecs)
+sse2-ints := 1 2 4 8
+sse2-flts := 4 8
+
+# When converting SSE to AVX, have the compiler avoid XMM0 to widen
+# coverage og the VEX.vvvv checks in the emulator.
+sse2avx := -ffixed-xmm0 -Wa,-msse2avx
+
+simd-cflags := $(foreach flavor,sse sse2, \
+                 $(foreach vec,$($(flavor)-vecs), \
+                   $(foreach int,$($(flavor)-ints), \
+                     "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+                     "-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)" \
+                     "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+                     "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+                   $(foreach flt,$($(flavor)-flts), \
+                     "-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)" \
+                     "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+                 $(foreach flt,$($(flavor)-flts), \
+                   "-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)" \
+                   "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx) -O2 -DFLOAT_SIZE=$(flt)"))
+
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
 	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
--- /dev/null
+++ b/tools/tests/x86_emulator/simd.c
@@ -0,0 +1,450 @@
+#include <stdbool.h>
+
+asm (
+    "\t.text\n"
+    "\t.globl _start\n"
+    "_start:\n"
+#if defined(__i386__) && VEC_SIZE == 16
+    "\tpush %ebp\n"
+    "\tmov %esp,%ebp\n"
+    "\tand $~0xf,%esp\n"
+    "\tcall simd_test\n"
+    "\tleave\n"
+    "\tret"
+#else
+    "\tjmp simd_test"
+#endif
+    );
+
+typedef
+#if defined(INT_SIZE)
+# define ELEM_SIZE INT_SIZE
+signed int
+# if INT_SIZE == 1
+#  define MODE QI
+# elif INT_SIZE == 2
+#  define MODE HI
+# elif INT_SIZE == 4
+#  define MODE SI
+# elif INT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(UINT_SIZE)
+# define ELEM_SIZE UINT_SIZE
+unsigned int
+# if UINT_SIZE == 1
+#  define MODE QI
+# elif UINT_SIZE == 2
+#  define MODE HI
+# elif UINT_SIZE == 4
+#  define MODE SI
+# elif UINT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(FLOAT_SIZE)
+float
+# define ELEM_SIZE FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define MODE SF
+# elif FLOAT_SIZE == 8
+#  define MODE DF
+# endif
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE ELEM_SIZE
+#endif
+__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
+
+#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
+
+typedef unsigned int __attribute((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
+
+/* Various builtins want plain char / int / long long vector types ... */
+typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
+typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+#if VEC_SIZE >= 8
+typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+#endif
+
+#if VEC_SIZE == 8 && defined(__SSE__)
+# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
+#elif VEC_SIZE == 16
+# if defined(__SSE__) && ELEM_SIZE == 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
+# elif defined(__SSE2__)
+#  if ELEM_SIZE == 8
+#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
+#  else
+#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+#  endif
+# endif
+#endif
+
+#ifndef to_bool
+static inline bool _to_bool(byte_vec_t bv)
+{
+    unsigned int i;
+
+    for ( i = 0; i < VEC_SIZE; ++i )
+        if ( bv[i] != 0xff )
+            return false;
+
+    return true;
+}
+# define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
+#endif
+
+#if VEC_SIZE == FLOAT_SIZE
+# define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE == 16 && defined(__SSE2__)
+# if FLOAT_SIZE == 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
+# elif FLOAT_SIZE == 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
+# endif
+#endif
+
+#if VEC_SIZE == FLOAT_SIZE
+# define scalar_1op(x, op) ({ \
+    typeof((x)[0]) __attribute__((vector_size(16))) r; \
+    asm ( op : [out] "=&x" (r) : [in] "m" (x) ); \
+    (vec_t){ r[0] }; \
+})
+#endif
+
+#if FLOAT_SIZE == 4 && defined(__SSE__)
+# if VEC_SIZE == 16
+#  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
+#  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
+#  define max(x, y) __builtin_ia32_maxps(x, y)
+#  define min(x, y) __builtin_ia32_minps(x, y)
+#  define recip(x) __builtin_ia32_rcpps(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps(x)
+#  define sqrt(x) __builtin_ia32_sqrtps(x)
+#  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+# elif VEC_SIZE == 4
+#  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
+#  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
+#  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
+# endif
+#elif FLOAT_SIZE == 8 && defined(__SSE2__)
+# if VEC_SIZE == 16
+#  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
+#  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
+#  define max(x, y) __builtin_ia32_maxpd(x, y)
+#  define min(x, y) __builtin_ia32_minpd(x, y)
+#  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
+#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
+#  define sqrt(x) __builtin_ia32_sqrtpd(x)
+#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+# elif VEC_SIZE == 8
+#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
+#  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
+#  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSE2__)
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)x, (vqi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)x, (vqi_t)y))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)x, (vhi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)x, (vhi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
+                   (vsi_t)__builtin_ia32_pshufhw( \
+                          __builtin_ia32_pshuflw((vhi_t)x, 0b00011011), 0b00011011), 0b01001110))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)x, (vsi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)x, (vsi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b00011011))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)x, (vdi_t)y))
+#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)x, (vdi_t)y))
+#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b01001110))
+# endif
+# if UINT_SIZE == 1
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)x, (vqi_t)y))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)x, (vqi_t)y))
+# elif INT_SIZE == 2
+#  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
+#  define min(x, y) __builtin_ia32_pminsw128(x, y)
+#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
+# elif UINT_SIZE == 2
+#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)x, (vhi_t)y))
+# elif UINT_SIZE == 4
+#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)x, (vsi_t)y))
+# endif
+# define select(d, x, y, m) ({ \
+    void *d_ = (d); \
+    vqi_t m_ = (vqi_t)(m); \
+    __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
+    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
+})
+#endif
+#if VEC_SIZE == FLOAT_SIZE
+# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
+# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
+#endif
+
+/*
+ * Suppress value propagation by the compiler, preventing unwanted
+ * optimization. This at once makes the compiler use memory operands
+ * more often, which for our purposes is the more interesting case.
+ */
+#define touch(var) asm volatile ( "" : "+m" (var) )
+
+int simd_test(void)
+{
+    unsigned int i, j;
+    vec_t x, y, z, src, inv, alt, sh;
+
+    for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i + 1;
+        inv[i] = ELEM_COUNT - i;
+#ifdef UINT_SIZE
+        alt[i] = -!(i & 1);
+#else
+        alt[i] = i & 1 ? -1 : 1;
+#endif
+        if ( !(i & (i + 1)) )
+            --j;
+        sh[i] = j;
+    }
+
+    touch(src);
+    x = src;
+    touch(x);
+    if ( !to_bool(x == src) ) return __LINE__;
+
+    touch(src);
+    y = x + src;
+    touch(src);
+    touch(y);
+    if ( !to_bool(y == 2 * src) ) return __LINE__;
+
+    touch(src);
+    z = y -= src;
+    touch(z);
+    if ( !to_bool(x == z) ) return __LINE__;
+
+#if defined(UINT_SIZE)
+
+    touch(inv);
+    x |= inv;
+    touch(inv);
+    y &= inv;
+    touch(inv);
+    z ^= inv;
+    touch(inv);
+    touch(x);
+    if ( !to_bool((x & ~y) == z) ) return __LINE__;
+
+#elif ELEM_SIZE > 1 || VEC_SIZE <= 8
+
+    touch(src);
+    x *= src;
+    y = inv * inv;
+    touch(src);
+    z = src + inv;
+    touch(inv);
+    z *= (src - inv);
+    if ( !to_bool(x - y == z) ) return __LINE__;
+
+#endif
+
+#if defined(FLOAT_SIZE)
+
+    x = src * alt;
+    touch(alt);
+    y = src / alt;
+    if ( !to_bool(x == y) ) return __LINE__;
+    touch(alt);
+    touch(src);
+    if ( !to_bool(x * -alt == -src) ) return __LINE__;
+
+# if defined(recip) && defined(to_int)
+
+    touch(src);
+    x = recip(src);
+    touch(src);
+    touch(x);
+    if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;
+
+#  ifdef rsqrt
+    x = src * src;
+    touch(x);
+    y = rsqrt(x);
+    touch(y);
+    if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
+    touch(src);
+    if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
+#  endif
+
+# endif
+
+# ifdef sqrt
+    x = src * src;
+    touch(x);
+    if ( !to_bool(sqrt(x) == src) ) return __LINE__;
+# endif
+
+#else
+
+# if ELEM_SIZE > 1
+
+    touch(inv);
+    x = src * inv;
+    touch(inv);
+    y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
+    for ( i = 1; i < ELEM_COUNT / 2; ++i )
+        y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
+    if ( !to_bool(x == y) ) return __LINE__;
+
+# ifdef mul_hi
+    touch(alt);
+    x = mul_hi(src, alt);
+    touch(alt);
+#  ifdef INT_SIZE
+    if ( !to_bool(x == (alt < 0)) ) return __LINE__;
+#  else
+    if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
+#  endif
+# endif
+
+# ifdef mul_full
+    x = src ^ alt;
+    touch(inv);
+    y = mul_full(x, inv);
+    touch(inv);
+    for ( i = 0; i < ELEM_COUNT; i += 2 )
+    {
+        unsigned long long res = x[i] * 1ULL * inv[i];
+
+        z[i] = res;
+        z[i + 1] = res >> (ELEM_SIZE << 3);
+    }
+    if ( !to_bool(y == z) ) return __LINE__;
+# endif
+
+    z = src;
+#  ifdef INT_SIZE
+    z *= alt;
+#  endif
+    touch(z);
+    x = z << 3;
+    touch(z);
+    y = z << 2;
+    touch(z);
+    if ( !to_bool(x == y + y) ) return __LINE__;
+
+    touch(x);
+    z = x >> 2;
+    touch(x);
+    if ( !to_bool(y == z + z) ) return __LINE__;
+
+    z = src;
+#  ifdef INT_SIZE
+    z *= alt;
+#  endif
+    /*
+     * Note that despite the touch()-es here there doesn't appear to be a way
+     * to make the compiler use a memory operand for the shift instruction (at
+     * least without resorting to built-ins).
+     */
+    j = 3;
+    touch(j);
+    x = z << j;
+    touch(j);
+    j = 2;
+    touch(j);
+    y = z << j;
+    touch(j);
+    if ( !to_bool(x == y + y) ) return __LINE__;
+
+    z = x >> j;
+    touch(j);
+    if ( !to_bool(y == z + z) ) return __LINE__;
+
+# endif
+
+# if ELEM_SIZE == 2 || defined(__SSE4_1__)
+    /*
+     * While there are no instructions with varying shift counts per field,
+     * the code turns out to be a nice exercise for pextr/pinsr.
+     */
+    z = src;
+#  ifdef INT_SIZE
+    z *= alt;
+#  endif
+    /*
+     * Zap elements for which the shift count is negative (and hence the
+     * decrement below would yield a negative count).
+     */
+    z &= (sh > 0);
+    touch(sh);
+    x = z << sh;
+    touch(sh);
+    --sh;
+    touch(sh);
+    y = z << sh;
+    touch(sh);
+    if ( !to_bool(x == y + y) ) return __LINE__;
+
+# endif
+
+#endif
+
+#if defined(max) && defined(min)
+# ifdef UINT_SIZE
+    touch(inv);
+    x = min(src, inv);
+    touch(inv);
+    y = max(src, inv);
+    touch(inv);
+    if ( !to_bool(x + y == src + inv) ) return __LINE__;
+# else
+    x = src * alt;
+    y = inv * alt;
+    touch(y);
+    z = max(x, y);
+    touch(y);
+    y = min(x, y);
+    touch(y);
+    if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
+# endif
+#endif
+
+#ifdef swap
+    touch(src);
+    if ( !to_bool(swap(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(interleave_lo) && defined(interleave_hi)
+    touch(src);
+    x = interleave_lo(inv, src);
+    touch(src);
+    y = interleave_hi(inv, src);
+    touch(src);
+# ifdef UINT_SIZE
+    z = ((x - y) ^ ~alt) - ~alt;
+# else
+    z = (x - y) * alt;
+# endif
+    if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
+#endif
+
+#ifdef select
+# ifdef UINT_SIZE
+    select(&z, src, inv, alt);
+# else
+    select(&z, src, inv, alt > 0);
+# endif
+    for ( i = 0; i < ELEM_COUNT; ++i )
+        y[i] = (i & 1 ? inv : src)[i];
+    if ( !to_bool(z == y) ) return __LINE__;
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@
 
 #include "x86_emulate.h"
 #include "blowfish.h"
+#include "simd.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st
     return regs->eax == 2 && regs->edx == 1;
 }
 
+static bool simd_check_sse(void)
+{
+    return cpu_has_sse;
+}
+
+static bool simd_check_sse2(void)
+{
+    return cpu_has_sse2;
+}
+
+static bool simd_check_avx(void)
+{
+    return cpu_has_avx;
+}
+#define simd_check_sse_avx   simd_check_avx
+#define simd_check_sse2_avx  simd_check_avx
+
+static void simd_set_regs(struct cpu_user_regs *regs)
+{
+    if ( cpu_has_mmx )
+        asm volatile ( "emms" );
+}
+
+static bool simd_check_regs(const struct cpu_user_regs *regs)
+{
+    if ( !regs->eax )
+        return true;
+    printf("[line %u] ", (unsigned int)regs->eax);
+    return false;
+}
+
 static const struct {
     const void *code;
     size_t size;
     unsigned int bitness;
     const char*name;
+    bool (*check_cpu)(void);
     void (*set_regs)(struct cpu_user_regs *);
     bool (*check_regs)(const struct cpu_user_regs *);
 } blobs[] = {
@@ -39,6 +72,49 @@ static const struct {
     BLOWFISH(32, blowfish, ),
     BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args),
 #undef BLOWFISH
+#define SIMD_(bits, desc, feat, form)                     \
+    { .code = simd_x86_##bits##_D##feat##_##form,         \
+      .size = sizeof(simd_x86_##bits##_D##feat##_##form), \
+      .bitness = bits, .name = #desc,                     \
+      .check_cpu = simd_check_##feat,                     \
+      .set_regs = simd_set_regs,                          \
+      .check_regs = simd_check_regs }
+#ifdef __x86_64__
+# define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
+                                SIMD_(32, desc, feat, form)
+#else
+# define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+#endif
+    SIMD(SSE scalar single,      sse,         f4),
+    SIMD(SSE packed single,      sse,       16f4),
+    SIMD(SSE2 scalar single,     sse2,        f4),
+    SIMD(SSE2 packed single,     sse2,      16f4),
+    SIMD(SSE2 scalar double,     sse2,        f8),
+    SIMD(SSE2 packed double,     sse2,      16f8),
+    SIMD(SSE2 packed s8,         sse2,      16i1),
+    SIMD(SSE2 packed u8,         sse2,      16u1),
+    SIMD(SSE2 packed s16,        sse2,      16i2),
+    SIMD(SSE2 packed u16,        sse2,      16u2),
+    SIMD(SSE2 packed s32,        sse2,      16i4),
+    SIMD(SSE2 packed u32,        sse2,      16u4),
+    SIMD(SSE2 packed s64,        sse2,      16i8),
+    SIMD(SSE2 packed u64,        sse2,      16u8),
+    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
+    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
+    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
+    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
+    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
+    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
+    SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
+    SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
+    SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
+    SIMD(SSE2/AVX packed u16,    sse2_avx,  16u2),
+    SIMD(SSE2/AVX packed s32,    sse2_avx,  16i4),
+    SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
+    SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
+    SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
+#undef SIMD_
+#undef SIMD
 };
 
 /* EFLAGS bit definitions. */
@@ -2598,6 +2674,9 @@ int main(int argc, char **argv)
             continue;
         }
 
+        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
+            continue;
+
         memcpy(res, blobs[j].code, blobs[j].size);
         ctxt.addr_size = ctxt.sp_size = blobs[j].bitness;
 

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

      parent reply	other threads:[~2017-02-01 11:19 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-02-01 11:07 [PATCH v2 00/11] x86emul: MMX/SSE/SSE2 support Jan Beulich
2017-02-01 11:12 ` [PATCH v2 01/11] x86emul: catch exceptions occurring in stubs Jan Beulich
2017-02-10 16:38   ` Andrew Cooper
2017-02-13 11:40     ` Jan Beulich
2017-02-13 13:58       ` Andrew Cooper
2017-02-13 16:20         ` Jan Beulich
2017-02-14 10:56           ` Andrew Cooper
2017-02-01 11:13 ` [PATCH v2 02/11] x86emul: flatten twobyte_table[] Jan Beulich
2017-02-10 17:13   ` Andrew Cooper
2017-02-13 10:41     ` Jan Beulich
2017-02-01 11:14 ` [PATCH v2 03/11] x86emul: support most memory accessing MMX/SSE/SSE2 insns Jan Beulich
2017-02-03 10:31   ` Jan Beulich
2017-02-13 11:20   ` Jan Beulich
2017-02-01 11:14 ` [PATCH v2 04/11] x86emul: support MMX/SSE/SSE2 moves Jan Beulich
2017-02-01 11:15 ` [PATCH v2 05/11] x86emul: support MMX/SSE/SSE2 converts Jan Beulich
2017-02-01 11:16 ` [PATCH v2 06/11] x86emul: support {,V}{,U}COMIS{S,D} Jan Beulich
2017-02-01 11:16 ` [PATCH v2 07/11] x86emul: support MMX/SSE/SSE2 insns with only register operands Jan Beulich
2017-02-01 11:17 ` [PATCH v2 08/11] x86emul: support {,V}{LD,ST}MXCSR Jan Beulich
2017-02-01 11:17 ` [PATCH v2 09/11] x86emul: support {,V}MOVNTDQA Jan Beulich
2017-02-01 11:18 ` [PATCH v2 10/11] x86emul/test: split generic and testcase specific parts Jan Beulich
2017-02-01 11:19 ` Jan Beulich [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5891D23D0200007800135C17@prv-mh.provo.novell.com \
    --to=jbeulich@suse.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.