From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Jan Beulich" Subject: [PATCH v2 11/11] x86emul: test coverage for SSE/SSE2 insns Date: Wed, 01 Feb 2017 04:19:09 -0700 Message-ID: <5891D23D0200007800135C17@prv-mh.provo.novell.com> References: <5891CF990200007800135BC5@prv-mh.provo.novell.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="=__Part58619C3D.1__=" Return-path: Received: from mail6.bemta6.messagelabs.com ([193.109.254.103]) by lists.xenproject.org with esmtp (Exim 4.84_2) (envelope-from ) id 1cYswW-0003m7-Pq for xen-devel@lists.xenproject.org; Wed, 01 Feb 2017 11:19:17 +0000 In-Reply-To: <5891CF990200007800135BC5@prv-mh.provo.novell.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" To: xen-devel Cc: Andrew Cooper List-Id: xen-devel@lists.xenproject.org This is a MIME message. If you are reading this text, you may want to consider changing to a mail reader or gateway that understands how to properly handle MIME multipart messages. --=__Part58619C3D.1__= Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Content-Disposition: inline ... and their AVX equivalents. Note that a few instructions aren't covered (yet), but those all fall into common pattern groups, so I would hope that for now we can do with what is there. MMX insns aren't being covered at all, as they're not easy to deal with: The compiler refuses to emit such for other than uses of built-in functions. The current way of testing AVX insns is meant to be temporary only: Once we fully support that feature, the present tests should rather be replaced than full ones simply added. Signed-off-by: Jan Beulich --- v2: New. 
--- a/tools/tests/x86_emulator/Makefile +++ b/tools/tests/x86_emulator/Makefile @@ -11,11 +11,36 @@ all: $(TARGET) run: $(TARGET) ./$(TARGET) =20 -TESTCASES :=3D blowfish +TESTCASES :=3D blowfish simd =20 blowfish-cflags :=3D "" blowfish-cflags-x86_32 :=3D "-mno-accumulate-outgoing-args -Dstatic=3D" =20 +sse-vecs :=3D 16 +sse-ints :=3D +sse-flts :=3D 4 +sse2-vecs :=3D $(sse-vecs) +sse2-ints :=3D 1 2 4 8 +sse2-flts :=3D 4 8 + +# When converting SSE to AVX, have the compiler avoid XMM0 to widen +# coverage of the VEX.vvvv checks in the emulator. +sse2avx :=3D -ffixed-xmm0 -Wa,-msse2avx + +simd-cflags :=3D $(foreach flavor,sse sse2, \ + $(foreach vec,$($(flavor)-vecs), \ + $(foreach int,$($(flavor)-ints), \ + "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE= =3D$(vec) -DINT_SIZE=3D$(int)" \ + "-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE= =3D$(vec) -DUINT_SIZE=3D$(int)" \ + "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx)= -O2 -DVEC_SIZE=3D$(vec) -DINT_SIZE=3D$(int)" \ + "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx)= -O2 -DVEC_SIZE=3D$(vec) -DUINT_SIZE=3D$(int)") \ + $(foreach flt,$($(flavor)-flts), \ + "-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE= =3D$(vec) -DFLOAT_SIZE=3D$(flt)" \ + "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx)= -O2 -DVEC_SIZE=3D$(vec) -DFLOAT_SIZE=3D$(flt)")) \ + $(foreach flt,$($(flavor)-flts), \ + "-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=3Dsse -O2 = -DFLOAT_SIZE=3D$(flt)" \ + "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=3Dsse = $(sse2avx) -O2 -DFLOAT_SIZE=3D$(flt)")) + $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile rm -f $@.new $*.bin $(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPI= LE_ARCH), \ --- /dev/null +++ b/tools/tests/x86_emulator/simd.c @@ -0,0 +1,450 @@ +#include <stdbool.h> + +asm ( + "\t.text\n" + "\t.globl _start\n" + "_start:\n" +#if defined(__i386__) && VEC_SIZE =3D=3D 16 + "\tpush %ebp\n" + "\tmov %esp,%ebp\n" + "\tand $~0xf,%esp\n" + "\tcall simd_test\n" + 
"\tleave\n" + "\tret" +#else + "\tjmp simd_test" +#endif + ); + +typedef +#if defined(INT_SIZE) +# define ELEM_SIZE INT_SIZE +signed int +# if INT_SIZE =3D=3D 1 +# define MODE QI +# elif INT_SIZE =3D=3D 2 +# define MODE HI +# elif INT_SIZE =3D=3D 4 +# define MODE SI +# elif INT_SIZE =3D=3D 8 +# define MODE DI +# endif +#elif defined(UINT_SIZE) +# define ELEM_SIZE UINT_SIZE +unsigned int +# if UINT_SIZE =3D=3D 1 +# define MODE QI +# elif UINT_SIZE =3D=3D 2 +# define MODE HI +# elif UINT_SIZE =3D=3D 4 +# define MODE SI +# elif UINT_SIZE =3D=3D 8 +# define MODE DI +# endif +#elif defined(FLOAT_SIZE) +float +# define ELEM_SIZE FLOAT_SIZE +# if FLOAT_SIZE =3D=3D 4 +# define MODE SF +# elif FLOAT_SIZE =3D=3D 8 +# define MODE DF +# endif +#endif +#ifndef VEC_SIZE +# define VEC_SIZE ELEM_SIZE +#endif +__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t; + +#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE) + +typedef unsigned int __attribute((mode(QI), vector_size(VEC_SIZE))) = byte_vec_t; + +/* Various builtins want plain char / int / long long vector types ... 
*/ +typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t; +typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t; +typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t; +#if VEC_SIZE >=3D 8 +typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; +#endif + +#if VEC_SIZE =3D=3D 8 && defined(__SSE__) +# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) =3D=3D 0xff) +#elif VEC_SIZE =3D=3D 16 +# if defined(__SSE__) && ELEM_SIZE =3D=3D 4 +# define to_bool(cmp) (__builtin_ia32_movmskps(cmp) =3D=3D 0xf) +# elif defined(__SSE2__) +# if ELEM_SIZE =3D=3D 8 +# define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) =3D=3D 3) +# else +# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) =3D=3D 0xffff) +# endif +# endif +#endif + +#ifndef to_bool +static inline bool _to_bool(byte_vec_t bv) +{ + unsigned int i; + + for ( i =3D 0; i < VEC_SIZE; ++i ) + if ( bv[i] !=3D 0xff ) + return false; + + return true; +} +# define to_bool(cmp) _to_bool((byte_vec_t)(cmp)) +#endif + +#if VEC_SIZE =3D=3D FLOAT_SIZE +# define to_int(x) ((vec_t){ (int)(x)[0] }) +#elif VEC_SIZE =3D=3D 16 && defined(__SSE2__) +# if FLOAT_SIZE =3D=3D 4 +# define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x)) +# elif FLOAT_SIZE =3D=3D 8 +# define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x)) +# endif +#endif + +#if VEC_SIZE =3D=3D FLOAT_SIZE +# define scalar_1op(x, op) ({ \ + typeof((x)[0]) __attribute__((vector_size(16))) r; \ + asm ( op : [out] "=3D&x" (r) : [in] "m" (x) ); \ + (vec_t){ r[0] }; \ +}) +#endif + +#if FLOAT_SIZE =3D=3D 4 && defined(__SSE__) +# if VEC_SIZE =3D=3D 16 +# define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y) +# define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y) +# define max(x, y) __builtin_ia32_maxps(x, y) +# define min(x, y) __builtin_ia32_minps(x, y) +# define recip(x) __builtin_ia32_rcpps(x) +# define rsqrt(x) __builtin_ia32_rsqrtps(x) +# define sqrt(x) __builtin_ia32_sqrtps(x) +# define swap(x) __builtin_ia32_shufps(x, x, 
0b00011011) +# elif VEC_SIZE =3D=3D 4 +# define recip(x) scalar_1op(x, "rcpss %[in], %[out]") +# define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]") +# define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]") +# endif +#elif FLOAT_SIZE =3D=3D 8 && defined(__SSE2__) +# if VEC_SIZE =3D=3D 16 +# define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y) +# define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y) +# define max(x, y) __builtin_ia32_maxpd(x, y) +# define min(x, y) __builtin_ia32_minpd(x, y) +# define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_= ia32_cvtpd2ps(x))) +# define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builti= n_ia32_cvtpd2ps(x))) +# define sqrt(x) __builtin_ia32_sqrtpd(x) +# define swap(x) __builtin_ia32_shufpd(x, x, 0b01) +# elif VEC_SIZE =3D=3D 8 +# define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], = %[out]; cvtss2sd %[out], %[out]") +# define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], = %[out]; cvtss2sd %[out], %[out]") +# define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]") +# endif +#endif +#if VEC_SIZE =3D=3D 16 && defined(__SSE2__) +# if INT_SIZE =3D=3D 1 || UINT_SIZE =3D=3D 1 +# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)x= , (vqi_t)y)) +# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)x= , (vqi_t)y)) +# elif INT_SIZE =3D=3D 2 || UINT_SIZE =3D=3D 2 +# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)x= , (vhi_t)y)) +# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)x= , (vhi_t)y)) +# define swap(x) ((vec_t)__builtin_ia32_pshufd( \ + (vsi_t)__builtin_ia32_pshufhw( \ + __builtin_ia32_pshuflw((vhi_t)x, 0b00011011), = 0b00011011), 0b01001110)) +# elif INT_SIZE =3D=3D 4 || UINT_SIZE =3D=3D 4 +# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)x= , (vsi_t)y)) +# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)x= , (vsi_t)y)) +# 
define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b00011011)) +# elif INT_SIZE =3D=3D 8 || UINT_SIZE =3D=3D 8 +# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)= x, (vdi_t)y)) +# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)= x, (vdi_t)y)) +# define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)x, 0b01001110)) +# endif +# if UINT_SIZE =3D=3D 1 +# define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)x, (vqi_t)y)) +# define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)x, (vqi_t)y)) +# elif INT_SIZE =3D=3D 2 +# define max(x, y) __builtin_ia32_pmaxsw128(x, y) +# define min(x, y) __builtin_ia32_pminsw128(x, y) +# define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y) +# elif UINT_SIZE =3D=3D 2 +# define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)x, = (vhi_t)y)) +# elif UINT_SIZE =3D=3D 4 +# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)x, = (vsi_t)y)) +# endif +# define select(d, x, y, m) ({ \ + void *d_ =3D (d); \ + vqi_t m_ =3D (vqi_t)(m); \ + __builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \ + __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \ +}) +#endif +#if VEC_SIZE =3D=3D FLOAT_SIZE +# define max(x, y) ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D (y)[0]; = x_ > y_ ? x_ : y_; })}) +# define min(x, y) ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D (y)[0]; = x_ < y_ ? x_ : y_; })}) +#endif + +/* + * Suppress value propagation by the compiler, preventing unwanted + * optimization. This at once makes the compiler use memory operands + * more often, which for our purposes is the more interesting case. + */ +#define touch(var) asm volatile ( "" : "+m" (var) ) + +int simd_test(void) +{ + unsigned int i, j; + vec_t x, y, z, src, inv, alt, sh; + + for ( i =3D 0, j =3D ELEM_SIZE << 3; i < ELEM_COUNT; ++i ) + { + src[i] =3D i + 1; + inv[i] =3D ELEM_COUNT - i; +#ifdef UINT_SIZE + alt[i] =3D -!(i & 1); +#else + alt[i] =3D i & 1 ? 
-1 : 1; +#endif + if ( !(i & (i + 1)) ) + --j; + sh[i] =3D j; + } + + touch(src); + x =3D src; + touch(x); + if ( !to_bool(x =3D=3D src) ) return __LINE__; + + touch(src); + y =3D x + src; + touch(src); + touch(y); + if ( !to_bool(y =3D=3D 2 * src) ) return __LINE__; + + touch(src); + z =3D y -=3D src; + touch(z); + if ( !to_bool(x =3D=3D z) ) return __LINE__; + +#if defined(UINT_SIZE) + + touch(inv); + x |=3D inv; + touch(inv); + y &=3D inv; + touch(inv); + z ^=3D inv; + touch(inv); + touch(x); + if ( !to_bool((x & ~y) =3D=3D z) ) return __LINE__; + +#elif ELEM_SIZE > 1 || VEC_SIZE <=3D 8 + + touch(src); + x *=3D src; + y =3D inv * inv; + touch(src); + z =3D src + inv; + touch(inv); + z *=3D (src - inv); + if ( !to_bool(x - y =3D=3D z) ) return __LINE__; + +#endif + +#if defined(FLOAT_SIZE) + + x =3D src * alt; + touch(alt); + y =3D src / alt; + if ( !to_bool(x =3D=3D y) ) return __LINE__; + touch(alt); + touch(src); + if ( !to_bool(x * -alt =3D=3D -src) ) return __LINE__; + +# if defined(recip) && defined(to_int) + + touch(src); + x =3D recip(src); + touch(src); + touch(x); + if ( !to_bool(to_int(recip(x)) =3D=3D src) ) return __LINE__; + +# ifdef rsqrt + x =3D src * src; + touch(x); + y =3D rsqrt(x); + touch(y); + if ( !to_bool(to_int(recip(y)) =3D=3D src) ) return __LINE__; + touch(src); + if ( !to_bool(to_int(y) =3D=3D to_int(recip(src))) ) return __LINE__; +# endif + +# endif + +# ifdef sqrt + x =3D src * src; + touch(x); + if ( !to_bool(sqrt(x) =3D=3D src) ) return __LINE__; +# endif + +#else + +# if ELEM_SIZE > 1 + + touch(inv); + x =3D src * inv; + touch(inv); + y[ELEM_COUNT - 1] =3D y[0] =3D j =3D ELEM_COUNT; + for ( i =3D 1; i < ELEM_COUNT / 2; ++i ) + y[ELEM_COUNT - i - 1] =3D y[i] =3D y[i - 1] + (j -=3D 2); + if ( !to_bool(x =3D=3D y) ) return __LINE__; + +# ifdef mul_hi + touch(alt); + x =3D mul_hi(src, alt); + touch(alt); +# ifdef INT_SIZE + if ( !to_bool(x =3D=3D (alt < 0)) ) return __LINE__; +# else + if ( !to_bool(x =3D=3D (src & alt) + alt) ) 
return __LINE__; +# endif +# endif + +# ifdef mul_full + x =3D src ^ alt; + touch(inv); + y =3D mul_full(x, inv); + touch(inv); + for ( i =3D 0; i < ELEM_COUNT; i +=3D 2 ) + { + unsigned long long res =3D x[i] * 1ULL * inv[i]; + + z[i] =3D res; + z[i + 1] =3D res >> (ELEM_SIZE << 3); + } + if ( !to_bool(y =3D=3D z) ) return __LINE__; +# endif + + z =3D src; +# ifdef INT_SIZE + z *=3D alt; +# endif + touch(z); + x =3D z << 3; + touch(z); + y =3D z << 2; + touch(z); + if ( !to_bool(x =3D=3D y + y) ) return __LINE__; + + touch(x); + z =3D x >> 2; + touch(x); + if ( !to_bool(y =3D=3D z + z) ) return __LINE__; + + z =3D src; +# ifdef INT_SIZE + z *=3D alt; +# endif + /* + * Note that despite the touch()-es here there doesn't appear to be a = way + * to make the compiler use a memory operand for the shift instruction= (at + * least without resorting to built-ins). + */ + j =3D 3; + touch(j); + x =3D z << j; + touch(j); + j =3D 2; + touch(j); + y =3D z << j; + touch(j); + if ( !to_bool(x =3D=3D y + y) ) return __LINE__; + + z =3D x >> j; + touch(j); + if ( !to_bool(y =3D=3D z + z) ) return __LINE__; + +# endif + +# if ELEM_SIZE =3D=3D 2 || defined(__SSE4_1__) + /* + * While there are no instructions with varying shift counts per = field, + * the code turns out to be a nice exercise for pextr/pinsr. + */ + z =3D src; +# ifdef INT_SIZE + z *=3D alt; +# endif + /* + * Zap elements for which the shift count is not positive (and hence = the + * decrement below would yield a negative count). 
+ */ + z &=3D (sh > 0); + touch(sh); + x =3D z << sh; + touch(sh); + --sh; + touch(sh); + y =3D z << sh; + touch(sh); + if ( !to_bool(x =3D=3D y + y) ) return __LINE__; + +# endif + +#endif + +#if defined(max) && defined(min) +# ifdef UINT_SIZE + touch(inv); + x =3D min(src, inv); + touch(inv); + y =3D max(src, inv); + touch(inv); + if ( !to_bool(x + y =3D=3D src + inv) ) return __LINE__; +# else + x =3D src * alt; + y =3D inv * alt; + touch(y); + z =3D max(x, y); + touch(y); + y =3D min(x, y); + touch(y); + if ( !to_bool((y + z) * alt =3D=3D src + inv) ) return __LINE__; +# endif +#endif + +#ifdef swap + touch(src); + if ( !to_bool(swap(src) =3D=3D inv) ) return __LINE__; +#endif + +#if defined(interleave_lo) && defined(interleave_hi) + touch(src); + x =3D interleave_lo(inv, src); + touch(src); + y =3D interleave_hi(inv, src); + touch(src); +# ifdef UINT_SIZE + z =3D ((x - y) ^ ~alt) - ~alt; +# else + z =3D (x - y) * alt; +# endif + if ( !to_bool(z =3D=3D ELEM_COUNT / 2) ) return __LINE__; +#endif + +#ifdef select +# ifdef UINT_SIZE + select(&z, src, inv, alt); +# else + select(&z, src, inv, alt > 0); +# endif + for ( i =3D 0; i < ELEM_COUNT; ++i ) + y[i] =3D (i & 1 ? inv : src)[i]; + if ( !to_bool(z =3D=3D y) ) return __LINE__; +#endif + + return 0; +} --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -5,6 +5,7 @@ =20 #include "x86_emulate.h" #include "blowfish.h" +#include "simd.h" =20 #define verbose false /* Switch to true for far more logging. 
*/ =20 @@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st return regs->eax =3D=3D 2 && regs->edx =3D=3D 1; } =20 +static bool simd_check_sse(void) +{ + return cpu_has_sse; +} + +static bool simd_check_sse2(void) +{ + return cpu_has_sse2; +} + +static bool simd_check_avx(void) +{ + return cpu_has_avx; +} +#define simd_check_sse_avx simd_check_avx +#define simd_check_sse2_avx simd_check_avx + +static void simd_set_regs(struct cpu_user_regs *regs) +{ + if ( cpu_has_mmx ) + asm volatile ( "emms" ); +} + +static bool simd_check_regs(const struct cpu_user_regs *regs) +{ + if ( !regs->eax ) + return true; + printf("[line %u] ", (unsigned int)regs->eax); + return false; +} + static const struct { const void *code; size_t size; unsigned int bitness; const char*name; + bool (*check_cpu)(void); void (*set_regs)(struct cpu_user_regs *); bool (*check_regs)(const struct cpu_user_regs *); } blobs[] =3D { @@ -39,6 +72,49 @@ static const struct { BLOWFISH(32, blowfish, ), BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args), #undef BLOWFISH +#define SIMD_(bits, desc, feat, form) \ + { .code =3D simd_x86_##bits##_D##feat##_##form, \ + .size =3D sizeof(simd_x86_##bits##_D##feat##_##form), \ + .bitness =3D bits, .name =3D #desc, \ + .check_cpu =3D simd_check_##feat, \ + .set_regs =3D simd_set_regs, \ + .check_regs =3D simd_check_regs } +#ifdef __x86_64__ +# define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \ + SIMD_(32, desc, feat, form) +#else +# define SIMD(desc, feat, form) SIMD_(32, desc, feat, form) +#endif + SIMD(SSE scalar single, sse, f4), + SIMD(SSE packed single, sse, 16f4), + SIMD(SSE2 scalar single, sse2, f4), + SIMD(SSE2 packed single, sse2, 16f4), + SIMD(SSE2 scalar double, sse2, f8), + SIMD(SSE2 packed double, sse2, 16f8), + SIMD(SSE2 packed s8, sse2, 16i1), + SIMD(SSE2 packed u8, sse2, 16u1), + SIMD(SSE2 packed s16, sse2, 16i2), + SIMD(SSE2 packed u16, sse2, 16u2), + SIMD(SSE2 packed s32, sse2, 16i4), + SIMD(SSE2 packed u32, sse2, 16u4), + 
SIMD(SSE2 packed s64, sse2, 16i8), + SIMD(SSE2 packed u64, sse2, 16u8), + SIMD(SSE/AVX scalar single, sse_avx, f4), + SIMD(SSE/AVX packed single, sse_avx, 16f4), + SIMD(SSE2/AVX scalar single, sse2_avx, f4), + SIMD(SSE2/AVX packed single, sse2_avx, 16f4), + SIMD(SSE2/AVX scalar double, sse2_avx, f8), + SIMD(SSE2/AVX packed double, sse2_avx, 16f8), + SIMD(SSE2/AVX packed s8, sse2_avx, 16i1), + SIMD(SSE2/AVX packed u8, sse2_avx, 16u1), + SIMD(SSE2/AVX packed s16, sse2_avx, 16i2), + SIMD(SSE2/AVX packed u16, sse2_avx, 16u2), + SIMD(SSE2/AVX packed s32, sse2_avx, 16i4), + SIMD(SSE2/AVX packed u32, sse2_avx, 16u4), + SIMD(SSE2/AVX packed s64, sse2_avx, 16i8), + SIMD(SSE2/AVX packed u64, sse2_avx, 16u8), +#undef SIMD_ +#undef SIMD }; =20 /* EFLAGS bit definitions. */ @@ -2598,6 +2674,9 @@ int main(int argc, char **argv) continue; } =20 + if ( blobs[j].check_cpu && !blobs[j].check_cpu() ) + continue; + memcpy(res, blobs[j].code, blobs[j].size); ctxt.addr_size =3D ctxt.sp_size =3D blobs[j].bitness; =20 --=__Part58619C3D.1__= Content-Type: text/plain; name="x86emul-SSE-AVX-0f-test.patch" Content-Transfer-Encoding: quoted-printable Content-Disposition: attachment; filename="x86emul-SSE-AVX-0f-test.patch" x86emul: test coverage for SSE/SSE2 insns=0A=0A... and their AVX equivalent= s. 
Note that a few instructions aren't=0Acovered (yet), but those all fall = into common pattern groups, so I=0Awould hope that for now we can do with = what is there.=0A=0AMMX insns aren't being covered at all, as they're not = easy to deal=0Awith: The compiler refuses to emit such for other than uses = of built-in=0Afunctions.=0A=0AThe current way of testing AVX insns is = meant to be temporary only:=0AOnce we fully support that feature, the = present tests should rather be=0Areplaced than full ones simply = added.=0A=0ASigned-off-by: Jan Beulich =0A---=0Av2: = New.=0A=0A--- a/tools/tests/x86_emulator/Makefile=0A+++ b/tools/tests/x86_e= mulator/Makefile=0A@@ -11,11 +11,36 @@ all: $(TARGET)=0A run: $(TARGET)=0A = ./$(TARGET)=0A =0A-TESTCASES :=3D blowfish=0A+TESTCASES :=3D = blowfish simd=0A =0A blowfish-cflags :=3D ""=0A blowfish-cflags-x86_32 = :=3D "-mno-accumulate-outgoing-args -Dstatic=3D"=0A =0A+sse-vecs :=3D = 16=0A+sse-ints :=3D=0A+sse-flts :=3D 4=0A+sse2-vecs :=3D $(sse-vecs)=0A+sse= 2-ints :=3D 1 2 4 8=0A+sse2-flts :=3D 4 8=0A+=0A+# When converting SSE to = AVX, have the compiler avoid XMM0 to widen=0A+# coverage of the VEX.vvvv = checks in the emulator.=0A+sse2avx :=3D -ffixed-xmm0 -Wa,-msse2avx=0A+=0A+s= imd-cflags :=3D $(foreach flavor,sse sse2, \=0A+ $(foreach = vec,$($(flavor)-vecs), \=0A+ $(foreach int,$($(flavor)-in= ts), \=0A+ "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 = -DVEC_SIZE=3D$(vec) -DINT_SIZE=3D$(int)" \=0A+ = "-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=3D$(vec) -DUINT_SIZE= =3D$(int)" \=0A+ "-D$(flavor)_avx_$(vec)i$(int) = -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=3D$(vec) -DINT_SIZE=3D$(int)" \=0A+ = "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) = -O2 -DVEC_SIZE=3D$(vec) -DUINT_SIZE=3D$(int)") \=0A+ = $(foreach flt,$($(flavor)-flts), \=0A+ "-D$(flavor)_$(v= ec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=3D$(vec) -DFLOAT_SIZE=3D$(flt)" = \=0A+ "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) = $(sse2avx) -O2 -DVEC_SIZE=3D$(vec) 
-DFLOAT_SIZE=3D$(flt)")) \=0A+ = $(foreach flt,$($(flavor)-flts), \=0A+ "-D$(flavor= )_f$(flt) -m$(flavor) -mfpmath=3Dsse -O2 -DFLOAT_SIZE=3D$(flt)" \=0A+ = "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=3Dsse $(sse2avx)= -O2 -DFLOAT_SIZE=3D$(flt)"))=0A+=0A $(addsuffix .h,$(TESTCASES)): %.h: = %.c testcase.mk Makefile=0A rm -f $@.new $*.bin=0A $(foreach = arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \=0A--- = /dev/null=0A+++ b/tools/tests/x86_emulator/simd.c=0A@@ -0,0 +1,450 = @@=0A+#include <stdbool.h>=0A+=0A+asm (=0A+ "\t.text\n"=0A+ = "\t.globl _start\n"=0A+ "_start:\n"=0A+#if defined(__i386__) && = VEC_SIZE =3D=3D 16=0A+ "\tpush %ebp\n"=0A+ "\tmov %esp,%ebp\n"=0A+ = "\tand $~0xf,%esp\n"=0A+ "\tcall simd_test\n"=0A+ "\tleave\n"=0A+ = "\tret"=0A+#else=0A+ "\tjmp simd_test"=0A+#endif=0A+ );=0A+=0A+typed= ef=0A+#if defined(INT_SIZE)=0A+# define ELEM_SIZE INT_SIZE=0A+signed = int=0A+# if INT_SIZE =3D=3D 1=0A+# define MODE QI=0A+# elif INT_SIZE = =3D=3D 2=0A+# define MODE HI=0A+# elif INT_SIZE =3D=3D 4=0A+# define = MODE SI=0A+# elif INT_SIZE =3D=3D 8=0A+# define MODE DI=0A+# endif=0A+#eli= f defined(UINT_SIZE)=0A+# define ELEM_SIZE UINT_SIZE=0A+unsigned int=0A+# = if UINT_SIZE =3D=3D 1=0A+# define MODE QI=0A+# elif UINT_SIZE =3D=3D = 2=0A+# define MODE HI=0A+# elif UINT_SIZE =3D=3D 4=0A+# define MODE = SI=0A+# elif UINT_SIZE =3D=3D 8=0A+# define MODE DI=0A+# endif=0A+#elif = defined(FLOAT_SIZE)=0A+float=0A+# define ELEM_SIZE FLOAT_SIZE=0A+# if = FLOAT_SIZE =3D=3D 4=0A+# define MODE SF=0A+# elif FLOAT_SIZE =3D=3D = 8=0A+# define MODE DF=0A+# endif=0A+#endif=0A+#ifndef VEC_SIZE=0A+# = define VEC_SIZE ELEM_SIZE=0A+#endif=0A+__attribute__((mode(MODE), = vector_size(VEC_SIZE))) vec_t;=0A+=0A+#define ELEM_COUNT (VEC_SIZE / = ELEM_SIZE)=0A+=0A+typedef unsigned int __attribute((mode(QI), vector_size(V= EC_SIZE))) byte_vec_t;=0A+=0A+/* Various builtins want plain char / int / = long long vector types ... 
*/=0A+typedef char __attribute__((vector_size(VE= C_SIZE))) vqi_t;=0A+typedef short __attribute__((vector_size(VEC_SIZE))) = vhi_t;=0A+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;=0A+#if = VEC_SIZE >=3D 8=0A+typedef long long __attribute__((vector_size(VEC_SIZE)))= vdi_t;=0A+#endif=0A+=0A+#if VEC_SIZE =3D=3D 8 && defined(__SSE__)=0A+# = define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) =3D=3D 0xff)=0A+#elif = VEC_SIZE =3D=3D 16=0A+# if defined(__SSE__) && ELEM_SIZE =3D=3D 4=0A+# = define to_bool(cmp) (__builtin_ia32_movmskps(cmp) =3D=3D 0xf)=0A+# elif = defined(__SSE2__)=0A+# if ELEM_SIZE =3D=3D 8=0A+# define to_bool(cmp) = (__builtin_ia32_movmskpd(cmp) =3D=3D 3)=0A+# else=0A+# define to_bool(cm= p) (__builtin_ia32_pmovmskb128(cmp) =3D=3D 0xffff)=0A+# endif=0A+# = endif=0A+#endif=0A+=0A+#ifndef to_bool=0A+static inline bool _to_bool(byte_= vec_t bv)=0A+{=0A+ unsigned int i;=0A+=0A+ for ( i =3D 0; i < = VEC_SIZE; ++i )=0A+ if ( bv[i] !=3D 0xff )=0A+ return = false;=0A+=0A+ return true;=0A+}=0A+# define to_bool(cmp) _to_bool((byte= _vec_t)(cmp))=0A+#endif=0A+=0A+#if VEC_SIZE =3D=3D FLOAT_SIZE=0A+# define = to_int(x) ((vec_t){ (int)(x)[0] })=0A+#elif VEC_SIZE =3D=3D 16 && = defined(__SSE2__)=0A+# if FLOAT_SIZE =3D=3D 4=0A+# define to_int(x) = __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))=0A+# elif FLOAT_SIZE = =3D=3D 8=0A+# define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtp= d2dq(x))=0A+# endif=0A+#endif=0A+=0A+#if VEC_SIZE =3D=3D FLOAT_SIZE=0A+# = define scalar_1op(x, op) ({ \=0A+ typeof((x)[0]) __attribute__((vector_s= ize(16))) r; \=0A+ asm ( op : [out] "=3D&x" (r) : [in] "m" (x) ); \=0A+ = (vec_t){ r[0] }; \=0A+})=0A+#endif=0A+=0A+#if FLOAT_SIZE =3D=3D 4 && = defined(__SSE__)=0A+# if VEC_SIZE =3D=3D 16=0A+# define interleave_hi(x, = y) __builtin_ia32_unpckhps(x, y)=0A+# define interleave_lo(x, y) = __builtin_ia32_unpcklps(x, y)=0A+# define max(x, y) __builtin_ia32_maxps(x= , y)=0A+# define min(x, y) __builtin_ia32_minps(x, y)=0A+# 
define = recip(x) __builtin_ia32_rcpps(x)=0A+# define rsqrt(x) __builtin_ia32_rsqrt= ps(x)=0A+# define sqrt(x) __builtin_ia32_sqrtps(x)=0A+# define swap(x) = __builtin_ia32_shufps(x, x, 0b00011011)=0A+# elif VEC_SIZE =3D=3D 4=0A+# = define recip(x) scalar_1op(x, "rcpss %[in], %[out]")=0A+# define rsqrt(x) = scalar_1op(x, "rsqrtss %[in], %[out]")=0A+# define sqrt(x) scalar_1op(x, = "sqrtss %[in], %[out]")=0A+# endif=0A+#elif FLOAT_SIZE =3D=3D 8 && = defined(__SSE2__)=0A+# if VEC_SIZE =3D=3D 16=0A+# define interleave_hi(x, = y) __builtin_ia32_unpckhpd(x, y)=0A+# define interleave_lo(x, y) = __builtin_ia32_unpcklpd(x, y)=0A+# define max(x, y) __builtin_ia32_maxpd(x= , y)=0A+# define min(x, y) __builtin_ia32_minpd(x, y)=0A+# define = recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2= ps(x)))=0A+# define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtp= s(__builtin_ia32_cvtpd2ps(x)))=0A+# define sqrt(x) __builtin_ia32_sqrtpd(x= )=0A+# define swap(x) __builtin_ia32_shufpd(x, x, 0b01)=0A+# elif = VEC_SIZE =3D=3D 8=0A+# define recip(x) scalar_1op(x, "cvtsd2ss %[in], = %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")=0A+# define = rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; = cvtss2sd %[out], %[out]")=0A+# define sqrt(x) scalar_1op(x, "sqrtsd = %[in], %[out]")=0A+# endif=0A+#endif=0A+#if VEC_SIZE =3D=3D 16 && = defined(__SSE2__)=0A+# if INT_SIZE =3D=3D 1 || UINT_SIZE =3D=3D 1=0A+# = define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)x, = (vqi_t)y))=0A+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklb= w128((vqi_t)x, (vqi_t)y))=0A+# elif INT_SIZE =3D=3D 2 || UINT_SIZE =3D=3D = 2=0A+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi= _t)x, (vhi_t)y))=0A+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_pu= npcklwd128((vhi_t)x, (vhi_t)y))=0A+# define swap(x) ((vec_t)__builtin_ia32= _pshufd( \=0A+ (vsi_t)__builtin_ia32_pshufhw( \=0A+ = 
__builtin_ia32_pshuflw((vhi_t)x, 0b00011011), = 0b00011011), 0b01001110))=0A+# elif INT_SIZE =3D=3D 4 || UINT_SIZE =3D=3D = 4=0A+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi= _t)x, (vsi_t)y))=0A+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_pu= npckldq128((vsi_t)x, (vsi_t)y))=0A+# define swap(x) ((vec_t)__builtin_ia32= _pshufd((vsi_t)x, 0b00011011))=0A+# elif INT_SIZE =3D=3D 8 || UINT_SIZE = =3D=3D 8=0A+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq= 128((vdi_t)x, (vdi_t)y))=0A+# define interleave_lo(x, y) ((vec_t)__builtin= _ia32_punpcklqdq128((vdi_t)x, (vdi_t)y))=0A+# define swap(x) ((vec_t)__bui= ltin_ia32_pshufd((vsi_t)x, 0b01001110))=0A+# endif=0A+# if UINT_SIZE = =3D=3D 1=0A+# define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)x, = (vqi_t)y))=0A+# define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)x= , (vqi_t)y))=0A+# elif INT_SIZE =3D=3D 2=0A+# define max(x, y) __builtin_i= a32_pmaxsw128(x, y)=0A+# define min(x, y) __builtin_ia32_pminsw128(x, = y)=0A+# define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)=0A+# elif = UINT_SIZE =3D=3D 2=0A+# define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw= 128((vhi_t)x, (vhi_t)y))=0A+# elif UINT_SIZE =3D=3D 4=0A+# define = mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)x, (vsi_t)y))=0A+# = endif=0A+# define select(d, x, y, m) ({ \=0A+ void *d_ =3D (d); \=0A+ = vqi_t m_ =3D (vqi_t)(m); \=0A+ __builtin_ia32_maskmovdqu((vqi_t)(x), = m_, d_); \=0A+ __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); = \=0A+})=0A+#endif=0A+#if VEC_SIZE =3D=3D FLOAT_SIZE=0A+# define max(x, y) = ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D (y)[0]; x_ > y_ ? x_ : y_; = })})=0A+# define min(x, y) ((vec_t){({ typeof(x[0]) x_ =3D (x)[0], y_ =3D = (y)[0]; x_ < y_ ? x_ : y_; })})=0A+#endif=0A+=0A+/*=0A+ * Suppress value = propagation by the compiler, preventing unwanted=0A+ * optimization. 
This = at once makes the compiler use memory operands=0A+ * more often, which for = our purposes is the more interesting case.=0A+ */=0A+#define touch(var) = asm volatile ( "" : "+m" (var) )=0A+=0A+int simd_test(void)=0A+{=0A+ = unsigned int i, j;=0A+ vec_t x, y, z, src, inv, alt, sh;=0A+=0A+ for = ( i =3D 0, j =3D ELEM_SIZE << 3; i < ELEM_COUNT; ++i )=0A+ {=0A+ = src[i] =3D i + 1;=0A+ inv[i] =3D ELEM_COUNT - i;=0A+#ifdef = UINT_SIZE=0A+ alt[i] =3D -!(i & 1);=0A+#else=0A+ alt[i] =3D = i & 1 ? -1 : 1;=0A+#endif=0A+ if ( !(i & (i + 1)) )=0A+ = --j;=0A+ sh[i] =3D j;=0A+ }=0A+=0A+ touch(src);=0A+ x =3D = src;=0A+ touch(x);=0A+ if ( !to_bool(x =3D=3D src) ) return = __LINE__;=0A+=0A+ touch(src);=0A+ y =3D x + src;=0A+ touch(src);= =0A+ touch(y);=0A+ if ( !to_bool(y =3D=3D 2 * src) ) return = __LINE__;=0A+=0A+ touch(src);=0A+ z =3D y -=3D src;=0A+ = touch(z);=0A+ if ( !to_bool(x =3D=3D z) ) return __LINE__;=0A+=0A+#if = defined(UINT_SIZE)=0A+=0A+ touch(inv);=0A+ x |=3D inv;=0A+ = touch(inv);=0A+ y &=3D inv;=0A+ touch(inv);=0A+ z ^=3D inv;=0A+ = touch(inv);=0A+ touch(x);=0A+ if ( !to_bool((x & ~y) =3D=3D z) ) = return __LINE__;=0A+=0A+#elif ELEM_SIZE > 1 || VEC_SIZE <=3D 8=0A+=0A+ = touch(src);=0A+ x *=3D src;=0A+ y =3D inv * inv;=0A+ touch(src);= =0A+ z =3D src + inv;=0A+ touch(inv);=0A+ z *=3D (src - inv);=0A+ = if ( !to_bool(x - y =3D=3D z) ) return __LINE__;=0A+=0A+#endif=0A+=0A+#i= f defined(FLOAT_SIZE)=0A+=0A+ x =3D src * alt;=0A+ touch(alt);=0A+ = y =3D src / alt;=0A+ if ( !to_bool(x =3D=3D y) ) return __LINE__;=0A+ = touch(alt);=0A+ touch(src);=0A+ if ( !to_bool(x * -alt =3D=3D = -src) ) return __LINE__;=0A+=0A+# if defined(recip) && defined(to_int)=0A+= =0A+ touch(src);=0A+ x =3D recip(src);=0A+ touch(src);=0A+ = touch(x);=0A+ if ( !to_bool(to_int(recip(x)) =3D=3D src) ) return = __LINE__;=0A+=0A+# ifdef rsqrt=0A+ x =3D src * src;=0A+ touch(x);=0A= + y =3D rsqrt(x);=0A+ touch(y);=0A+ if ( !to_bool(to_int(recip(y))= =3D=3D src) ) return __LINE__;=0A+ 
touch(src);=0A+ if ( !to_bool(to_= int(y) =3D=3D to_int(recip(src))) ) return __LINE__;=0A+# endif=0A+=0A+# = endif=0A+=0A+# ifdef sqrt=0A+ x =3D src * src;=0A+ touch(x);=0A+ = if ( !to_bool(sqrt(x) =3D=3D src) ) return __LINE__;=0A+# endif=0A+=0A+#els= e=0A+=0A+# if ELEM_SIZE > 1=0A+=0A+ touch(inv);=0A+ x =3D src * = inv;=0A+ touch(inv);=0A+ y[ELEM_COUNT - 1] =3D y[0] =3D j =3D = ELEM_COUNT;=0A+ for ( i =3D 1; i < ELEM_COUNT / 2; ++i )=0A+ = y[ELEM_COUNT - i - 1] =3D y[i] =3D y[i - 1] + (j -=3D 2);=0A+ if ( = !to_bool(x =3D=3D y) ) return __LINE__;=0A+=0A+# ifdef mul_hi=0A+ = touch(alt);=0A+ x =3D mul_hi(src, alt);=0A+ touch(alt);=0A+# ifdef = INT_SIZE=0A+ if ( !to_bool(x =3D=3D (alt < 0)) ) return __LINE__;=0A+# = else=0A+ if ( !to_bool(x =3D=3D (src & alt) + alt) ) return __LINE__;=0A= +# endif=0A+# endif=0A+=0A+# ifdef mul_full=0A+ x =3D src ^ alt;=0A+ = touch(inv);=0A+ y =3D mul_full(x, inv);=0A+ touch(inv);=0A+ for = ( i =3D 0; i < ELEM_COUNT; i +=3D 2 )=0A+ {=0A+ unsigned long = long res =3D x[i] * 1ULL * inv[i];=0A+=0A+ z[i] =3D res;=0A+ = z[i + 1] =3D res >> (ELEM_SIZE << 3);=0A+ }=0A+ if ( !to_bool(y = =3D=3D z) ) return __LINE__;=0A+# endif=0A+=0A+ z =3D src;=0A+# ifdef = INT_SIZE=0A+ z *=3D alt;=0A+# endif=0A+ touch(z);=0A+ x =3D z << = 3;=0A+ touch(z);=0A+ y =3D z << 2;=0A+ touch(z);=0A+ if ( = !to_bool(x =3D=3D y + y) ) return __LINE__;=0A+=0A+ touch(x);=0A+ z = =3D x >> 2;=0A+ touch(x);=0A+ if ( !to_bool(y =3D=3D z + z) ) return = __LINE__;=0A+=0A+ z =3D src;=0A+# ifdef INT_SIZE=0A+ z *=3D = alt;=0A+# endif=0A+ /*=0A+ * Note that despite the touch()-es here = there doesn't appear to be a way=0A+ * to make the compiler use a = memory operand for the shift instruction (at=0A+ * least without = resorting to built-ins).=0A+ */=0A+ j =3D 3;=0A+ touch(j);=0A+ = x =3D z << j;=0A+ touch(j);=0A+ j =3D 2;=0A+ touch(j);=0A+ y = =3D z << j;=0A+ touch(j);=0A+ if ( !to_bool(x =3D=3D y + y) ) return = __LINE__;=0A+=0A+ z =3D x >> j;=0A+ touch(j);=0A+ if ( 
!to_bool(y = =3D=3D z + z) ) return __LINE__;=0A+=0A+# endif=0A+=0A+# if ELEM_SIZE = =3D=3D 2 || defined(__SSE4_1__)=0A+ /*=0A+ * While there are no = instructions with varying shift counts per field,=0A+ * the code turns = out to be a nice exercise for pextr/pinsr.=0A+ */=0A+ z =3D = src;=0A+# ifdef INT_SIZE=0A+ z *=3D alt;=0A+# endif=0A+ /*=0A+ = * Zap elements for which the shift count is negative (and the hence = the=0A+ * decrement below would yield a negative count.=0A+ */=0A+ = z &=3D (sh > 0);=0A+ touch(sh);=0A+ x =3D z << sh;=0A+ = touch(sh);=0A+ --sh;=0A+ touch(sh);=0A+ y =3D z << sh;=0A+ = touch(sh);=0A+ if ( !to_bool(x =3D=3D y + y) ) return __LINE__;=0A+=0A+#= endif=0A+=0A+#endif=0A+=0A+#if defined(max) && defined(min)=0A+# ifdef = UINT_SIZE=0A+ touch(inv);=0A+ x =3D min(src, inv);=0A+ touch(inv);= =0A+ y =3D max(src, inv);=0A+ touch(inv);=0A+ if ( !to_bool(x + y = =3D=3D src + inv) ) return __LINE__;=0A+# else=0A+ x =3D src * alt;=0A+ = y =3D inv * alt;=0A+ touch(y);=0A+ z =3D max(x, y);=0A+ = touch(y);=0A+ y =3D min(x, y);=0A+ touch(y);=0A+ if ( !to_bool((y = + z) * alt =3D=3D src + inv) ) return __LINE__;=0A+# endif=0A+#endif=0A+=0A= +#ifdef swap=0A+ touch(src);=0A+ if ( !to_bool(swap(src) =3D=3D inv) = ) return __LINE__;=0A+#endif=0A+=0A+#if defined(interleave_lo) && = defined(interleave_hi)=0A+ touch(src);=0A+ x =3D interleave_lo(inv, = src);=0A+ touch(src);=0A+ y =3D interleave_hi(inv, src);=0A+ = touch(src);=0A+# ifdef UINT_SIZE=0A+ z =3D ((x - y) ^ ~alt) - ~alt;=0A+#= else=0A+ z =3D (x - y) * alt;=0A+# endif=0A+ if ( !to_bool(z =3D=3D = ELEM_COUNT / 2) ) return __LINE__;=0A+#endif=0A+=0A+#ifdef select=0A+# = ifdef UINT_SIZE=0A+ select(&z, src, inv, alt);=0A+# else=0A+ = select(&z, src, inv, alt > 0);=0A+# endif=0A+ for ( i =3D 0; i < = ELEM_COUNT; ++i )=0A+ y[i] =3D (i & 1 ? 
inv : src)[i];=0A+ if ( = !to_bool(z =3D=3D y) ) return __LINE__;=0A+#endif=0A+=0A+ return = 0;=0A+}=0A--- a/tools/tests/x86_emulator/test_x86_emulator.c=0A+++ = b/tools/tests/x86_emulator/test_x86_emulator.c=0A@@ -5,6 +5,7 @@=0A =0A = #include "x86_emulate.h"=0A #include "blowfish.h"=0A+#include "simd.h"=0A = =0A #define verbose false /* Switch to true for far more logging. */=0A = =0A@@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st=0A = return regs->eax =3D=3D 2 && regs->edx =3D=3D 1;=0A }=0A =0A+static bool = simd_check_sse(void)=0A+{=0A+ return cpu_has_sse;=0A+}=0A+=0A+static = bool simd_check_sse2(void)=0A+{=0A+ return cpu_has_sse2;=0A+}=0A+=0A+sta= tic bool simd_check_avx(void)=0A+{=0A+ return cpu_has_avx;=0A+}=0A+#defi= ne simd_check_sse_avx simd_check_avx=0A+#define simd_check_sse2_avx = simd_check_avx=0A+=0A+static void simd_set_regs(struct cpu_user_regs = *regs)=0A+{=0A+ if ( cpu_has_mmx )=0A+ asm volatile ( "emms" = );=0A+}=0A+=0A+static bool simd_check_regs(const struct cpu_user_regs = *regs)=0A+{=0A+ if ( !regs->eax )=0A+ return true;=0A+ = printf("[line %u] ", (unsigned int)regs->eax);=0A+ return false;=0A+}=0A= +=0A static const struct {=0A const void *code;=0A size_t size;=0A = unsigned int bitness;=0A const char*name;=0A+ bool (*check_cpu)(= void);=0A void (*set_regs)(struct cpu_user_regs *);=0A bool = (*check_regs)(const struct cpu_user_regs *);=0A } blobs[] =3D {=0A@@ -39,6 = +72,49 @@ static const struct {=0A BLOWFISH(32, blowfish, ),=0A = BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args),=0A #undef = BLOWFISH=0A+#define SIMD_(bits, desc, feat, form) = \=0A+ { .code =3D simd_x86_##bits##_D##feat##_##form, \=0A+ = .size =3D sizeof(simd_x86_##bits##_D##feat##_##form), \=0A+ .bitness = =3D bits, .name =3D #desc, \=0A+ .check_cpu =3D = simd_check_##feat, \=0A+ .set_regs =3D simd_set_re= gs, \=0A+ .check_regs =3D simd_check_regs = }=0A+#ifdef __x86_64__=0A+# define SIMD(desc, feat, form) SIMD_(64, desc, = feat, form), \=0A+ 
SIMD_(32, desc, feat, = form)=0A+#else=0A+# define SIMD(desc, feat, form) SIMD_(32, desc, feat, = form)=0A+#endif=0A+ SIMD(SSE scalar single, sse, f4),=0A+ = SIMD(SSE packed single, sse, 16f4),=0A+ SIMD(SSE2 scalar = single, sse2, f4),=0A+ SIMD(SSE2 packed single, sse2, = 16f4),=0A+ SIMD(SSE2 scalar double, sse2, f8),=0A+ = SIMD(SSE2 packed double, sse2, 16f8),=0A+ SIMD(SSE2 packed s8, = sse2, 16i1),=0A+ SIMD(SSE2 packed u8, sse2, = 16u1),=0A+ SIMD(SSE2 packed s16, sse2, 16i2),=0A+ = SIMD(SSE2 packed u16, sse2, 16u2),=0A+ SIMD(SSE2 packed = s32, sse2, 16i4),=0A+ SIMD(SSE2 packed u32, sse2, = 16u4),=0A+ SIMD(SSE2 packed s64, sse2, 16i8),=0A+ = SIMD(SSE2 packed u64, sse2, 16u8),=0A+ SIMD(SSE/AVX scalar = single, sse_avx, f4),=0A+ SIMD(SSE/AVX packed single, sse_avx, = 16f4),=0A+ SIMD(SSE2/AVX scalar single, sse2_avx, f4),=0A+ = SIMD(SSE2/AVX packed single, sse2_avx, 16f4),=0A+ SIMD(SSE2/AVX scalar = double, sse2_avx, f8),=0A+ SIMD(SSE2/AVX packed double, sse2_avx, = 16f8),=0A+ SIMD(SSE2/AVX packed s8, sse2_avx, 16i1),=0A+ = SIMD(SSE2/AVX packed u8, sse2_avx, 16u1),=0A+ SIMD(SSE2/AVX packed = s16, sse2_avx, 16i2),=0A+ SIMD(SSE2/AVX packed u16, sse2_avx, = 16u2),=0A+ SIMD(SSE2/AVX packed s32, sse2_avx, 16i4),=0A+ = SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),=0A+ SIMD(SSE2/AVX packed = s64, sse2_avx, 16i8),=0A+ SIMD(SSE2/AVX packed u64, sse2_avx, = 16u8),=0A+#undef SIMD_=0A+#undef SIMD=0A };=0A =0A /* EFLAGS bit definition= s. 
*/=0A@@ -2598,6 +2674,9 @@ int main(int argc, char **argv)=0A = continue;=0A }=0A =0A+ if ( blobs[j].check_cpu && = !blobs[j].check_cpu() )=0A+ continue;=0A+=0A memcpy(res,= blobs[j].code, blobs[j].size);=0A ctxt.addr_size =3D ctxt.sp_size = =3D blobs[j].bitness;=0A =0A --=__Part58619C3D.1__= Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: base64 Content-Disposition: inline X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v cmcveGVuLWRldmVsCg== --=__Part58619C3D.1__=--