From: "Jan Beulich" <JBeulich@suse.com>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH 01/17] x86emul: support remaining AVX insns
Date: Wed, 21 Jun 2017 05:59:08 -0600
Message-ID: <594A7BAC02000078001652FB@prv-mh.provo.novell.com>
In-Reply-To: <594A733B020000780016527C@prv-mh.provo.novell.com>

I.e. those which have no SSEn equivalents.

There's one necessary change to generic code: the faulting behavior of
VMASKMOVP{S,D} requires us to do partial reads/writes.
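
To illustrate (sketch only, not part of the patch): the store forms
suppress both the write and any fault for elements whose mask bit is
clear, i.e. conceptually, for a 128-bit vmaskmovps store,

    for ( i = 0; i < 4; i++ )
        if ( mask[i] & 0x80000000 )  /* sign bit selects the element */
            mem[i] = src[i];         /* clear elements must not fault */

so the emulator may only access memory belonging to elements with set
mask bits. The generic code change therefore narrows the access to the
smallest byte range covered by set mask bits (suppressing it entirely
when the mask is all clear). A minimal C sketch of that narrowing,
mirroring the logic added to x86_emulate() below (assuming "mask" is
the non-zero vmovmskp{s,d} result and elem_bytes is 4 or 8):

    static void narrow(unsigned int mask, unsigned int elem_bytes,
                       unsigned int *first_byte, unsigned int *op_bytes)
    {
        unsigned int first = __builtin_ctz(mask); /* lowest set element */

        mask >>= first;
        *first_byte = first * elem_bytes;
        /* Cover up to and including the highest set element. */
        *op_bytes = elem_bytes * (32 - __builtin_clz(mask));
    }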

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -224,7 +224,7 @@
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
 tools/tests/x86_emulator/asm
-tools/tests/x86_emulator/avx*.h
+tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,8 +11,8 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4
-TESTCASES := blowfish $(SIMD) $(addsuffix -avx,$(filter sse%,$(SIMD)))
+SIMD := sse sse2 sse4 avx
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -26,34 +26,36 @@ sse2-flts := 4 8
 sse4-vecs := $(sse2-vecs)
 sse4-ints := $(sse2-ints)
 sse4-flts := $(sse2-flts)
+avx-vecs := 16 32
+avx-ints :=
+avx-flts := 4 8
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
 # however, for SSE4.1 and later, as there are instructions with XMM0 as
 # an implicit operand.
-sse2avx-sse  := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
 sse2avx-sse4 := -Wa,-msse2avx
 
+# For AVX and later, have the compiler avoid XMM0 to widen coverage of
+# the VEX.vvvv checks in the emulator.
+non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+
 define simd-defs
 $(1)-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
 	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
 $(1)-avx-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
 	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
-	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
-	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse $(sse2avx-$(1)) -O2 -DFLOAT_SIZE=$(flt)")
+	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -70,7 +70,13 @@ typedef long long __attribute__((vector_
 #if VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
-# if defined(__SSE4_1__)
+# if defined(__AVX__) && defined(FLOAT_SIZE)
+#  if ELEM_SIZE == 4
+#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+#  elif ELEM_SIZE == 8
+#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+#  endif
+# elif defined(__SSE4_1__)
 #  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
 # elif defined(__SSE__) && ELEM_SIZE == 4
 #  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
@@ -81,6 +87,12 @@ typedef long long __attribute__((vector_
 #   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
 #  endif
 # endif
+#elif VEC_SIZE == 32
+# if defined(__AVX__) && ELEM_SIZE == 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
+# elif defined(__AVX__) && ELEM_SIZE == 8
+#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
+# endif
 #endif
 
 #ifndef to_bool
@@ -105,6 +117,12 @@ static inline bool _to_bool(byte_vec_t b
 # elif FLOAT_SIZE == 8
 #  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
+# elif FLOAT_SIZE == 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
+# endif
 #endif
 
 #if VEC_SIZE == FLOAT_SIZE
@@ -116,7 +134,25 @@ static inline bool _to_bool(byte_vec_t b
 #endif
 
 #if FLOAT_SIZE == 4 && defined(__SSE__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+#  define max(x, y) __builtin_ia32_maxps256(x, y)
+#  define min(x, y) __builtin_ia32_minps256(x, y)
+#  define recip(x) __builtin_ia32_rcpps256(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
+#  define sqrt(x) __builtin_ia32_sqrtps256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+#  define swap2(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
+#  ifdef __AVX__
+#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
+#  endif
 #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
 #  define max(x, y) __builtin_ia32_maxps(x, y)
@@ -125,13 +161,39 @@ static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_rsqrtps(x)
 #  define sqrt(x) __builtin_ia32_sqrtps(x)
 #  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
+#  endif
 # elif VEC_SIZE == 4
 #  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
 #  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
 #  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
 # endif
 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+#  define max(x, y) __builtin_ia32_maxpd256(x, y)
+#  define min(x, y) __builtin_ia32_minpd256(x, y)
+#  define recip(x) ({ \
+    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
+    t_ = __builtin_ia32_vextractf128_ps256( \
+             __builtin_ia32_rcpps256( \
+                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
+    __builtin_ia32_cvtps2pd256(t_); \
+})
+#  define rsqrt(x) ({ \
+    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
+    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
+    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
+    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
+    __builtin_ia32_cvtps2pd256(t1_); \
+})
+#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
+    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
 #  define max(x, y) __builtin_ia32_maxpd(x, y)
@@ -140,6 +202,10 @@ static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
 #  define sqrt(x) __builtin_ia32_sqrtpd(x)
 #  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
+                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
+#  endif
 # elif VEC_SIZE == 8
 #  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
 #  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
@@ -201,6 +267,31 @@ static inline bool _to_bool(byte_vec_t b
 #  define hadd(x, y) __builtin_ia32_haddpd(x, y)
 #  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+#  define dup_hi(x) __builtin_ia32_movshdup256(x)
+#  define dup_lo(x) __builtin_ia32_movsldup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+#  define dup_lo(x) __builtin_ia32_movddup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__)
 # if INT_SIZE == 1
@@ -282,6 +373,31 @@ static inline bool _to_bool(byte_vec_t b
 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
 # endif
 #endif
+#if VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define dot_product(x, y) ({ \
+    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
+    (vec_t){t_[0] + t_[4]}; \
+})
+#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vsi_t m_ = (vsi_t)(m); \
+    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
+    __builtin_ia32_maskstoreps256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
+# elif FLOAT_SIZE == 8
+#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vdi_t m_ = (vdi_t)(m); \
+    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
+    __builtin_ia32_maskstorepd256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
+# endif
+#endif
 #if VEC_SIZE == FLOAT_SIZE
 # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
 # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
@@ -555,6 +671,15 @@ int simd_test(void)
     if ( !to_bool(swap(src) == inv) ) return __LINE__;
 #endif
 
+#ifdef swap2
+    touch(src);
+    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(broadcast)
+    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
@@ -652,6 +777,15 @@ int simd_test(void)
 # endif
     if ( !to_bool(z == y) ) return __LINE__;
 #endif
+
+#ifdef select2
+# ifdef UINT_SIZE
+    select2(&z, src, inv, alt);
+# else
+    select2(&z, src, inv, alt > 0);
+# endif
+    if ( !to_bool(z == y) ) return __LINE__;
+#endif
 
 #ifdef mix
     touch(src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -8,9 +8,9 @@
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
-#include "sse-avx.h"
 #include "sse2-avx.h"
 #include "sse4-avx.h"
+#include "avx.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -44,7 +44,6 @@ static bool simd_check_avx(void)
 {
     return cpu_has_avx;
 }
-#define simd_check_sse_avx   simd_check_avx
 #define simd_check_sse2_avx  simd_check_avx
 #define simd_check_sse4_avx  simd_check_avx
 
@@ -122,12 +121,6 @@ static const struct {
     SIMD(SSE4 packed u32,        sse4,      16u4),
     SIMD(SSE4 packed s64,        sse4,      16i8),
     SIMD(SSE4 packed u64,        sse4,      16u8),
-    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
-    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
-    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
-    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
-    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
-    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
     SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
     SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
     SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
@@ -136,10 +129,6 @@ static const struct {
     SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
     SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
     SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
-    SIMD(SSE4/AVX scalar single, sse4_avx,    f4),
-    SIMD(SSE4/AVX packed single, sse4_avx,  16f4),
-    SIMD(SSE4/AVX scalar double, sse4_avx,    f8),
-    SIMD(SSE4/AVX packed double, sse4_avx,  16f8),
     SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
     SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
     SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
@@ -148,6 +137,12 @@ static const struct {
     SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
     SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
     SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
+    SIMD(AVX scalar single,      avx,         f4),
+    SIMD(AVX 128bit single,      avx,       16f4),
+    SIMD(AVX 256bit single,      avx,       32f4),
+    SIMD(AVX scalar double,      avx,         f8),
+    SIMD(AVX 128bit double,      avx,       16f8),
+    SIMD(AVX 256bit double,      avx,       32f8),
 #undef SIMD_
 #undef SIMD
 };
@@ -2834,6 +2829,81 @@ int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    /*
+     * The following "maskmov" tests not only make sure the written data is
+     * correct, but also verify (by placing operands at the mapping
+     * boundaries) that elements controlled by clear mask bits aren't accessed.
+     */
+    printf("%-40s", "Testing vmaskmovps %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovps);
+
+        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqss %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovps, "vmaskmovps %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)res + MMAP_SZ - 4;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
+            goto fail;
+
+        asm volatile ( "vinsertps $0b00110111, %xmm2, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)(res - 3);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[0] || memcmp(res + 1, res + 4, 12) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmaskmovpd %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovpd);
+
+        asm volatile ( "vxorpd %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqsd %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovpd, "vmaskmovpd %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)res + MMAP_SZ - 8;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             res[MMAP_SZ / sizeof(*res) - 2] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
+            goto fail;
+
+        asm volatile ( "vmovddup %xmm2, %xmm2\n\t"
+                       "vmovsd %xmm1, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)(res - 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing stmxcsr (%edx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -226,6 +226,12 @@ enum simd_opsize {
      */
     simd_scalar_fp,
 
+    /*
+     * 128 bits of integer or floating point data, with no further
+     * formatting information.
+     */
+    simd_128,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -361,14 +367,19 @@ static const struct {
     uint8_t vsib:1;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
     [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x2b] = { .simd_size = simd_packed_int },
+    [0x2c ... 0x2d] = { .simd_size = simd_other },
+    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -391,11 +402,15 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
+    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+    [0x18] = { .simd_size = simd_128 },
+    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -469,15 +484,18 @@ union vex {
     buf_ + 3; \
 })
 
+#define copy_VEX(ptr, vex) ({ \
+    if ( !mode_64bit() ) \
+        (vex).reg |= 8; \
+    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+    (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+    container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
+})
+
 #define copy_REX_VEX(ptr, rex, vex) do { \
     if ( (vex).opcx != vex_none ) \
-    { \
-        if ( !mode_64bit() ) \
-            vex.reg |= 8; \
-        (ptr)[0 - PFX_BYTES] = 0xc4; \
-        (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
-        (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
-    } \
+        copy_VEX(ptr, vex); \
     else \
     { \
         if ( (vex).pfx ) \
@@ -2933,6 +2951,10 @@ x86_decode(
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
+    case simd_128:
+        op_bytes = 16;
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -2959,6 +2981,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
+    unsigned int first_byte = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
@@ -7099,6 +7122,18 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
     case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
@@ -7132,6 +7167,10 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
     case X86EMUL_OPC_66(0x0f38, 0x17):     /* ptest xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
         if ( vex.opcx == vex_none )
@@ -7212,6 +7251,69 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd {x,y}mm,{x,y}mm,mem */
+    {
+        typeof(vex) *pvex;
+
+        generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
+        host_and_vcpu_must_have(avx);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /*
+         * While we can't reasonably provide fully correct behavior here
+         * (in particular, for writes, avoiding the memory read in anticipation
+         * of all elements in the range eventually being written), we can (and
+         * should) still limit the memory access to the smallest possible range
+         * (suppressing it altogether if all mask bits are clear), to provide
+         * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
+         * for that purpose.
+         */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        if ( !(b & 1) )
+            pvex->pfx = vex_none;
+        opc[0] = 0x50; /* vmovmskp{s,d} */
+        /* Use %rax as GPR destination and VEX.vvvv as source. */
+        pvex->r = 1;
+        pvex->b = !mode_64bit() || (vex.reg >> 3);
+        opc[1] = 0xc0 | (~vex.reg & 7);
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+        put_stub(stub);
+
+        if ( !ea.val )
+            goto complete_insn;
+
+        op_bytes = 4 << (b & 1);
+        first_byte = __builtin_ctz(ea.val);
+        ea.val >>= first_byte;
+        first_byte *= op_bytes;
+        op_bytes *= 32 - __builtin_clz(ea.val);
+
+        /*
+         * Even for the memory write variant a memory read is needed, unless
+         * all set mask bits are contiguous.
+         */
+        if ( ea.val & (ea.val + 1) )
+            d = (d & ~SrcMask) | SrcMem;
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert memory operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        opc[1] = modrm & 0x38;
+        fic.insn_bytes = PFX_BYTES + 2;
+
+        break;
+    }
+
     case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
@@ -7407,6 +7509,16 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x04): /* vpermilps $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,{x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
     case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
@@ -7758,7 +7870,9 @@ x86_emulate(
             switch ( d & SrcMask )
             {
             case SrcMem:
-                rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+                rc = ops->read(ea.mem.seg, ea.mem.off + first_byte,
+                               (void *)mmvalp + first_byte, op_bytes,
+                               ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 /* fall through */
@@ -7776,7 +7890,22 @@ x86_emulate(
             if ( (d & DstMask) == DstMem )
             {
                 fail_if(!ops->write); /* Check before running the stub. */
-                ASSERT(d & Mov);
+                if ( (d & SrcMask) == SrcMem )
+                    d |= Mov; /* Force memory write to occur below. */
+
+                switch ( ctxt->opcode )
+                {
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} */
+                    /* These have merge semantics; force write to occur. */
+                    d |= Mov;
+                    break;
+                default:
+                    ASSERT(d & Mov);
+                    break;
+                }
+
                 dst.type = OP_MEM;
                 dst.bytes = op_bytes;
                 dst.mem = ea.mem;
@@ -7824,8 +7953,9 @@ x86_emulate(
         else
         {
             fail_if(!ops->write);
-            rc = ops->write(dst.mem.seg, dst.mem.off,
-                            !state->simd_size ? &dst.val : (void *)mmvalp,
+            rc = ops->write(dst.mem.seg, dst.mem.off + first_byte,
+                            !state->simd_size ? &dst.val
+                                              : (void *)mmvalp + first_byte,
                             dst.bytes, ctxt);
             if ( sfence )
                 asm volatile ( "sfence" ::: "memory" );
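
(For completeness: with this patch applied, the new avx testcases are
exercised through the Makefile's "run" target shown above, i.e. from
tools/tests/x86_emulator simply

    make run

builds and executes the harness, which then prints okay/skipped per
test depending on stack_exec and CPU feature availability.)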



Thread overview: 27+ messages
2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
2017-06-21 11:59 ` Jan Beulich [this message]
2017-09-13 15:02   ` [PATCH 01/17] x86emul: support remaining AVX insns George Dunlap
2017-09-13 15:31     ` Jan Beulich
2017-06-21 11:59 ` [PATCH 02/17] x86emul: re-order cases of main switch statement Jan Beulich
2017-09-13 15:15   ` George Dunlap
2017-06-21 12:00 ` [PATCH 03/17] x86emul: build SIMD tests with -Os Jan Beulich
2017-09-13 15:19   ` George Dunlap
2017-09-13 15:34     ` Jan Beulich
2017-06-21 12:01 ` [PATCH 04/17] x86emul: support F16C insns Jan Beulich
2017-09-13 17:10   ` George Dunlap
2017-09-14  9:13     ` George Dunlap
2017-09-14 10:24       ` Jan Beulich
2017-06-21 12:01 ` [PATCH 05/17] x86emul: support FMA4 insns Jan Beulich
2017-06-21 12:02 ` [PATCH 06/17] x86emul: support FMA insns Jan Beulich
2017-06-21 12:02 ` [PATCH 07/17] x86emul: support most remaining AVX2 insns Jan Beulich
2017-06-21 12:03 ` [PATCH 08/17] x86emul: fold/eliminate some local variables Jan Beulich
2017-06-21 12:04 ` [PATCH 09/17] x86emul: support AVX2 gather insns Jan Beulich
2017-06-21 12:04 ` [PATCH 10/17] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
2017-06-21 12:05 ` [PATCH 11/17] x86emul: support XOP insns Jan Beulich
2017-06-21 12:05 ` [PATCH 12/17] x86emul: support 3DNow! insns Jan Beulich
2017-06-21 12:06 ` [PATCH 13/17] x86emul: re-order checks in test harness Jan Beulich
2017-06-21 12:07 ` [PATCH 14/17] x86emul: abstract out XCRn accesses Jan Beulich
2017-06-21 12:07 ` [PATCH 15/17] x86emul: adjust_bnd() should check XCR0 Jan Beulich
2017-06-21 12:08 ` [PATCH 16/17] x86emul: make all FPU emulation use the stub Jan Beulich
2017-06-21 12:09 ` [PATCH 17/17] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
2017-09-05 17:08 ` [PATCH 00/17] x86: emulator enhancements George Dunlap
