* [PATCH 00/17] x86: emulator enhancements
@ 2017-06-21 11:23 Jan Beulich
  2017-06-21 11:59 ` [PATCH 01/17] x86emul: support remaining AVX insns Jan Beulich
                   ` (17 more replies)
  0 siblings, 18 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 11:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

01: support remaining AVX insns
02: re-order cases of main switch statement
03: build SIMD tests with -Os
04: support F16C insns
05: support FMA4 insns
06: support FMA insns
07: support most remaining AVX2 insns
08: fold/eliminate some local variables
09: support AVX2 gather insns
10: add tables for XOP 08 and 09 extension spaces
11: support XOP insns
12: support 3DNow! insns
13: re-order checks in test harness
14: abstract out XCRn accesses
15: adjust_bnd() should check XCR0
16: make all FPU emulation use the stub
17: eliminate custom #MF/#XM handling

Signed-off-by: Jan Beulich <jbeulich@suse.com>


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 01/17] x86emul: support remaining AVX insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
@ 2017-06-21 11:59 ` Jan Beulich
  2017-09-13 15:02   ` George Dunlap
  2017-06-21 11:59 ` [PATCH 02/17] x86emul: re-order cases of main switch statement Jan Beulich
                   ` (16 subsequent siblings)
  17 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 11:59 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

I.e. those which have no SSEn equivalents.

There's one necessary change to generic code: the faulting behavior of
VMASKMOVP{S,D} requires us to do partial reads/writes.
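
Purely as an illustration, not part of the patch: a minimal stand-alone
sketch of the range computation the emulator change below performs on
the vmovmskp{s,d} result, with made-up names. Elements below the lowest
and above the highest set mask bit are clipped from the access, a zero
mask suppresses the access altogether, and holes between set bits force
a read even for the write form.

    #include <stdio.h>

    static void mask_range(unsigned int mask, unsigned int elem_bytes)
    {
        if ( !mask )
        {
            puts("no memory access");              /* all mask bits clear */
            return;
        }

        unsigned int first = __builtin_ctz(mask);     /* lowest set element */
        unsigned int last = 31 - __builtin_clz(mask); /* highest set element */
        unsigned int bytes = (last - first + 1) * elem_bytes;

        /* Non-zero iff the set bits aren't contiguous. */
        const char *kind = ((mask >> first) & ((mask >> first) + 1))
                           ? "read-modify-write" : "plain";

        printf("offset %u, %u bytes, %s\n", first * elem_bytes, bytes, kind);
    }

    int main(void)
    {
        mask_range(0x0, 4); /* no memory access */
        mask_range(0x6, 4); /* offset 4, 8 bytes, plain */
        mask_range(0x5, 8); /* offset 0, 24 bytes, read-modify-write */
        return 0;
    }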

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -224,7 +224,7 @@
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
 tools/tests/x86_emulator/asm
-tools/tests/x86_emulator/avx*.h
+tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,8 +11,8 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4
-TESTCASES := blowfish $(SIMD) $(addsuffix -avx,$(filter sse%,$(SIMD)))
+SIMD := sse sse2 sse4 avx
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -26,34 +26,36 @@ sse2-flts := 4 8
 sse4-vecs := $(sse2-vecs)
 sse4-ints := $(sse2-ints)
 sse4-flts := $(sse2-flts)
+avx-vecs := 16 32
+avx-ints :=
+avx-flts := 4 8
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
 # however, for SSE4.1 and later, as there are instructions with XMM0 as
 # an implicit operand.
-sse2avx-sse  := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
 sse2avx-sse4 := -Wa,-msse2avx
 
+# For AVX and later, have the compiler avoid XMM0 to widen coverage of
+# the VEX.vvvv checks in the emulator.
+non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+
 define simd-defs
 $(1)-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
 	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
 $(1)-avx-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
 	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
-	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
-	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse $(sse2avx-$(1)) -O2 -DFLOAT_SIZE=$(flt)")
+	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -70,7 +70,13 @@ typedef long long __attribute__((vector_
 #if VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
-# if defined(__SSE4_1__)
+# if defined(__AVX__) && defined(FLOAT_SIZE)
+#  if ELEM_SIZE == 4
+#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+#  elif ELEM_SIZE == 8
+#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+#  endif
+# elif defined(__SSE4_1__)
 #  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
 # elif defined(__SSE__) && ELEM_SIZE == 4
 #  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
@@ -81,6 +87,12 @@ typedef long long __attribute__((vector_
 #   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
 #  endif
 # endif
+#elif VEC_SIZE == 32
+# if defined(__AVX__) && ELEM_SIZE == 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
+# elif defined(__AVX__) && ELEM_SIZE == 8
+#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
+# endif
 #endif
 
 #ifndef to_bool
@@ -105,6 +117,12 @@ static inline bool _to_bool(byte_vec_t b
 # elif FLOAT_SIZE == 8
 #  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
+# elif FLOAT_SIZE == 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
+# endif
 #endif
 
 #if VEC_SIZE == FLOAT_SIZE
@@ -116,7 +134,25 @@ static inline bool _to_bool(byte_vec_t b
 #endif
 
 #if FLOAT_SIZE == 4 && defined(__SSE__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+#  define max(x, y) __builtin_ia32_maxps256(x, y)
+#  define min(x, y) __builtin_ia32_minps256(x, y)
+#  define recip(x) __builtin_ia32_rcpps256(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
+#  define sqrt(x) __builtin_ia32_sqrtps256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+#  define swap2(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
+#  ifdef __AVX__
+#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
+#  endif
 #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
 #  define max(x, y) __builtin_ia32_maxps(x, y)
@@ -125,13 +161,39 @@ static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_rsqrtps(x)
 #  define sqrt(x) __builtin_ia32_sqrtps(x)
 #  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
+#  endif
 # elif VEC_SIZE == 4
 #  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
 #  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
 #  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
 # endif
 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+#  define max(x, y) __builtin_ia32_maxpd256(x, y)
+#  define min(x, y) __builtin_ia32_minpd256(x, y)
+#  define recip(x) ({ \
+    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
+    t_ = __builtin_ia32_vextractf128_ps256( \
+             __builtin_ia32_rcpps256( \
+                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
+    __builtin_ia32_cvtps2pd256(t_); \
+})
+#  define rsqrt(x) ({ \
+    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
+    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
+    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
+    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
+    __builtin_ia32_cvtps2pd256(t1_); \
+})
+#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
+    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
 #  define max(x, y) __builtin_ia32_maxpd(x, y)
@@ -140,6 +202,10 @@ static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
 #  define sqrt(x) __builtin_ia32_sqrtpd(x)
 #  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
+                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
+#  endif
 # elif VEC_SIZE == 8
 #  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
 #  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
@@ -201,6 +267,31 @@ static inline bool _to_bool(byte_vec_t b
 #  define hadd(x, y) __builtin_ia32_haddpd(x, y)
 #  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+#  define dup_hi(x) __builtin_ia32_movshdup256(x)
+#  define dup_lo(x) __builtin_ia32_movsldup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+#  define dup_lo(x) __builtin_ia32_movddup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__)
 # if INT_SIZE == 1
@@ -282,6 +373,31 @@ static inline bool _to_bool(byte_vec_t b
 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
 # endif
 #endif
+#if VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define dot_product(x, y) ({ \
+    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
+    (vec_t){t_[0] + t_[4]}; \
+})
+#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vsi_t m_ = (vsi_t)(m); \
+    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
+    __builtin_ia32_maskstoreps256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
+# elif FLOAT_SIZE == 8
+#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vdi_t m_ = (vdi_t)(m); \
+    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
+    __builtin_ia32_maskstorepd256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
+# endif
+#endif
 #if VEC_SIZE == FLOAT_SIZE
 # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
 # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
@@ -555,6 +671,15 @@ int simd_test(void)
     if ( !to_bool(swap(src) == inv) ) return __LINE__;
 #endif
 
+#ifdef swap2
+    touch(src);
+    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(broadcast)
+    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
@@ -652,6 +777,15 @@ int simd_test(void)
 # endif
     if ( !to_bool(z == y) ) return __LINE__;
 #endif
+
+#ifdef select2
+# ifdef UINT_SIZE
+    select2(&z, src, inv, alt);
+# else
+    select2(&z, src, inv, alt > 0);
+# endif
+    if ( !to_bool(z == y) ) return __LINE__;
+#endif
 
 #ifdef mix
     touch(src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -8,9 +8,9 @@
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
-#include "sse-avx.h"
 #include "sse2-avx.h"
 #include "sse4-avx.h"
+#include "avx.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -44,7 +44,6 @@ static bool simd_check_avx(void)
 {
     return cpu_has_avx;
 }
-#define simd_check_sse_avx   simd_check_avx
 #define simd_check_sse2_avx  simd_check_avx
 #define simd_check_sse4_avx  simd_check_avx
 
@@ -122,12 +121,6 @@ static const struct {
     SIMD(SSE4 packed u32,        sse4,      16u4),
     SIMD(SSE4 packed s64,        sse4,      16i8),
     SIMD(SSE4 packed u64,        sse4,      16u8),
-    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
-    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
-    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
-    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
-    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
-    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
     SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
     SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
     SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
@@ -136,10 +129,6 @@ static const struct {
     SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
     SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
     SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
-    SIMD(SSE4/AVX scalar single, sse4_avx,    f4),
-    SIMD(SSE4/AVX packed single, sse4_avx,  16f4),
-    SIMD(SSE4/AVX scalar double, sse4_avx,    f8),
-    SIMD(SSE4/AVX packed double, sse4_avx,  16f8),
     SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
     SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
     SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
@@ -148,6 +137,12 @@ static const struct {
     SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
     SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
     SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
+    SIMD(AVX scalar single,      avx,         f4),
+    SIMD(AVX 128bit single,      avx,       16f4),
+    SIMD(AVX 256bit single,      avx,       32f4),
+    SIMD(AVX scalar double,      avx,         f8),
+    SIMD(AVX 128bit double,      avx,       16f8),
+    SIMD(AVX 256bit double,      avx,       32f8),
 #undef SIMD_
 #undef SIMD
 };
@@ -2834,6 +2829,81 @@ int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    /*
+     * The following "maskmov" tests not only make sure the written data
+     * is correct, but also verify (by placing operands on mapping boundaries)
+     * that elements controlled by clear mask bits aren't being accessed.
+     */
+    printf("%-40s", "Testing vmaskmovps %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovps);
+
+        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqss %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovps, "vmaskmovps %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)res + MMAP_SZ - 4;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
+            goto fail;
+
+        asm volatile ( "vinsertps $0b00110111, %xmm2, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)(res - 3);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[0] || memcmp(res + 1, res + 4, 12) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmaskmovpd %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovpd);
+
+        asm volatile ( "vxorpd %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqsd %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovpd, "vmaskmovpd %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)res + MMAP_SZ - 8;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             res[MMAP_SZ / sizeof(*res) - 2] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
+            goto fail;
+
+        asm volatile ( "vmovddup %xmm2, %xmm2\n\t"
+                       "vmovsd %xmm1, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)(res - 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing stmxcsr (%edx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -226,6 +226,12 @@ enum simd_opsize {
      */
     simd_scalar_fp,
 
+    /*
+     * 128 bits of integer or floating point data, with no further
+     * formatting information.
+     */
+    simd_128,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -361,14 +367,19 @@ static const struct {
     uint8_t vsib:1;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
     [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x2b] = { .simd_size = simd_packed_int },
+    [0x2c ... 0x2d] = { .simd_size = simd_other },
+    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -391,11 +402,15 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
+    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+    [0x18] = { .simd_size = simd_128 },
+    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -469,15 +484,18 @@ union vex {
     buf_ + 3; \
 })
 
+#define copy_VEX(ptr, vex) ({ \
+    if ( !mode_64bit() ) \
+        (vex).reg |= 8; \
+    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+    (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+    container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
+})
+
 #define copy_REX_VEX(ptr, rex, vex) do { \
     if ( (vex).opcx != vex_none ) \
-    { \
-        if ( !mode_64bit() ) \
-            vex.reg |= 8; \
-        (ptr)[0 - PFX_BYTES] = 0xc4; \
-        (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
-        (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
-    } \
+        copy_VEX(ptr, vex); \
     else \
     { \
         if ( (vex).pfx ) \
@@ -2933,6 +2951,10 @@ x86_decode(
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
+    case simd_128:
+        op_bytes = 16;
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -2959,6 +2981,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
+    unsigned int first_byte = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
@@ -7099,6 +7122,18 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
     case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
@@ -7132,6 +7167,10 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
     case X86EMUL_OPC_66(0x0f38, 0x17):     /* ptest xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
         if ( vex.opcx == vex_none )
@@ -7212,6 +7251,69 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd {x,y}mm,{x,y}mm,mem */
+    {
+        typeof(vex) *pvex;
+
+        generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
+        host_and_vcpu_must_have(avx);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /*
+         * While we can't reasonably provide fully correct behavior here
+         * (in particular, for writes, avoiding the memory read in anticipation
+         * of all elements in the range eventually being written), we can (and
+         * should) still limit the memory access to the smallest possible range
+         * (suppressing it altogether if all mask bits are clear), to provide
+         * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
+         * for that purpose.
+         */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        if ( !(b & 1) )
+            pvex->pfx = vex_none;
+        opc[0] = 0x50; /* vmovmskp{s,d} */
+        /* Use %rax as GPR destination and VEX.vvvv as source. */
+        pvex->r = 1;
+        pvex->b = !mode_64bit() || (vex.reg >> 3);
+        opc[1] = 0xc0 | (~vex.reg & 7);
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+        put_stub(stub);
+
+        if ( !ea.val )
+            goto complete_insn;
+
+        op_bytes = 4 << (b & 1);
+        first_byte = __builtin_ctz(ea.val);
+        ea.val >>= first_byte;
+        first_byte *= op_bytes;
+        op_bytes *= 32 - __builtin_clz(ea.val);
+
+        /*
+         * Even for the memory write variant a memory read is needed, unless
+         * all set mask bits are contiguous.
+         */
+        if ( ea.val & (ea.val + 1) )
+            d = (d & ~SrcMask) | SrcMem;
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert memory operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        opc[1] = modrm & 0x38;
+        fic.insn_bytes = PFX_BYTES + 2;
+
+        break;
+    }
+
     case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
@@ -7407,6 +7509,16 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x04): /* vpermilps $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,{x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
     case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
@@ -7758,7 +7870,9 @@ x86_emulate(
             switch ( d & SrcMask )
             {
             case SrcMem:
-                rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+                rc = ops->read(ea.mem.seg, ea.mem.off + first_byte,
+                               (void *)mmvalp + first_byte, op_bytes,
+                               ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 /* fall through */
@@ -7776,7 +7890,22 @@ x86_emulate(
             if ( (d & DstMask) == DstMem )
             {
                 fail_if(!ops->write); /* Check before running the stub. */
-                ASSERT(d & Mov);
+                if ( (d & SrcMask) == SrcMem )
+                    d |= Mov; /* Force memory write to occur below. */
+
+                switch ( ctxt->opcode )
+                {
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} */
+                    /* These have merge semantics; force write to occur. */
+                    d |= Mov;
+                    break;
+                default:
+                    ASSERT(d & Mov);
+                    break;
+                }
+
                 dst.type = OP_MEM;
                 dst.bytes = op_bytes;
                 dst.mem = ea.mem;
@@ -7824,8 +7953,9 @@ x86_emulate(
         else
         {
             fail_if(!ops->write);
-            rc = ops->write(dst.mem.seg, dst.mem.off,
-                            !state->simd_size ? &dst.val : (void *)mmvalp,
+            rc = ops->write(dst.mem.seg, dst.mem.off + first_byte,
+                            !state->simd_size ? &dst.val
+                                              : (void *)mmvalp + first_byte,
                             dst.bytes, ctxt);
             if ( sfence )
                 asm volatile ( "sfence" ::: "memory" );



[-- Attachment #2: x86emul-AVX.patch --]
[-- Type: text/plain, Size: 28262 bytes --]

x86emul: support remaining AVX insns

I.e. those not being equivalents of SSEn ones.

There's one necessary change to generic code: Faulting behavior of
VMASKMOVP{S,D} requires us to do partial reads/writes.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -224,7 +224,7 @@
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
 tools/tests/x86_emulator/asm
-tools/tests/x86_emulator/avx*.h
+tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,8 +11,8 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4
-TESTCASES := blowfish $(SIMD) $(addsuffix -avx,$(filter sse%,$(SIMD)))
+SIMD := sse sse2 sse4 avx
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -26,34 +26,36 @@ sse2-flts := 4 8
 sse4-vecs := $(sse2-vecs)
 sse4-ints := $(sse2-ints)
 sse4-flts := $(sse2-flts)
+avx-vecs := 16 32
+avx-ints :=
+avx-flts := 4 8
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
 # however, for SSE4.1 and later, as there are instructions with XMM0 as
 # an implicit operand.
-sse2avx-sse  := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
 sse2avx-sse4 := -Wa,-msse2avx
 
+# For AVX and later, have the compiler avoid XMM0 to widen coverage of
+# the VEX.vvvv checks in the emulator.
+non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+
 define simd-defs
 $(1)-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
 	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
 $(1)-avx-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
 	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
-	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
-	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) -mfpmath=sse $(sse2avx-$(1)) -O2 -DFLOAT_SIZE=$(flt)")
+	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -70,7 +70,13 @@ typedef long long __attribute__((vector_
 #if VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
-# if defined(__SSE4_1__)
+# if defined(__AVX__) && defined(FLOAT_SIZE)
+#  if ELEM_SIZE == 4
+#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+#  elif ELEM_SIZE == 8
+#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+#  endif
+# elif defined(__SSE4_1__)
 #  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
 # elif defined(__SSE__) && ELEM_SIZE == 4
 #  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
@@ -81,6 +87,12 @@ typedef long long __attribute__((vector_
 #   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
 #  endif
 # endif
+#elif VEC_SIZE == 32
+# if defined(__AVX__) && ELEM_SIZE == 4
+#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
+# elif defined(__AVX__) && ELEM_SIZE == 8
+#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
+# endif
 #endif
 
 #ifndef to_bool
@@ -105,6 +117,12 @@ static inline bool _to_bool(byte_vec_t b
 # elif FLOAT_SIZE == 8
 #  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
+# elif FLOAT_SIZE == 8
+#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
+# endif
 #endif
 
 #if VEC_SIZE == FLOAT_SIZE
@@ -116,7 +134,25 @@ static inline bool _to_bool(byte_vec_t b
 #endif
 
 #if FLOAT_SIZE == 4 && defined(__SSE__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+#  define max(x, y) __builtin_ia32_maxps256(x, y)
+#  define min(x, y) __builtin_ia32_minps256(x, y)
+#  define recip(x) __builtin_ia32_rcpps256(x)
+#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
+#  define sqrt(x) __builtin_ia32_sqrtps256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+#  define swap2(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
+#  ifdef __AVX__
+#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
+#  endif
 #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
 #  define max(x, y) __builtin_ia32_maxps(x, y)
@@ -125,13 +161,39 @@ static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_rsqrtps(x)
 #  define sqrt(x) __builtin_ia32_sqrtps(x)
 #  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
+#  endif
 # elif VEC_SIZE == 4
 #  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
 #  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
 #  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
 # endif
 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
-# if VEC_SIZE == 16
+# if VEC_SIZE == 32 && defined(__AVX__)
+#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+#  define max(x, y) __builtin_ia32_maxpd256(x, y)
+#  define min(x, y) __builtin_ia32_minpd256(x, y)
+#  define recip(x) ({ \
+    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
+    t_ = __builtin_ia32_vextractf128_ps256( \
+             __builtin_ia32_rcpps256( \
+                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
+    __builtin_ia32_cvtps2pd256(t_); \
+})
+#  define rsqrt(x) ({ \
+    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
+    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
+    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
+    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
+    __builtin_ia32_cvtps2pd256(t1_); \
+})
+#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
+#  define swap(x) ({ \
+    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
+    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
+})
+# elif VEC_SIZE == 16
 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
 #  define max(x, y) __builtin_ia32_maxpd(x, y)
@@ -140,6 +202,10 @@ static inline bool _to_bool(byte_vec_t b
 #  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
 #  define sqrt(x) __builtin_ia32_sqrtpd(x)
 #  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+#  ifdef __AVX__
+#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
+                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
+#  endif
 # elif VEC_SIZE == 8
 #  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
 #  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
@@ -201,6 +267,31 @@ static inline bool _to_bool(byte_vec_t b
 #  define hadd(x, y) __builtin_ia32_haddpd(x, y)
 #  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
 # endif
+#elif VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+#  define dup_hi(x) __builtin_ia32_movshdup256(x)
+#  define dup_lo(x) __builtin_ia32_movsldup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
+        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
+})
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+#  define dup_lo(x) __builtin_ia32_movddup256(x)
+#  define hadd(x, y) ({ \
+        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+#  define hsub(x, y) ({ \
+        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
+        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
+})
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__)
 # if INT_SIZE == 1
@@ -282,6 +373,31 @@ static inline bool _to_bool(byte_vec_t b
 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
 # endif
 #endif
+#if VEC_SIZE == 32 && defined(__AVX__)
+# if FLOAT_SIZE == 4
+#  define dot_product(x, y) ({ \
+    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
+    (vec_t){t_[0] + t_[4]}; \
+})
+#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vsi_t m_ = (vsi_t)(m); \
+    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
+    __builtin_ia32_maskstoreps256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
+# elif FLOAT_SIZE == 8
+#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
+#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
+#  define select2(d, x, y, m) ({ \
+    vdi_t m_ = (vdi_t)(m); \
+    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
+    __builtin_ia32_maskstorepd256(d, ~m_, y); \
+})
+#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
+# endif
+#endif
 #if VEC_SIZE == FLOAT_SIZE
 # define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
 # define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
@@ -555,6 +671,15 @@ int simd_test(void)
     if ( !to_bool(swap(src) == inv) ) return __LINE__;
 #endif
 
+#ifdef swap2
+    touch(src);
+    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(broadcast)
+    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
@@ -652,6 +777,15 @@ int simd_test(void)
 # endif
     if ( !to_bool(z == y) ) return __LINE__;
 #endif
+
+#ifdef select2
+# ifdef UINT_SIZE
+    select2(&z, src, inv, alt);
+# else
+    select2(&z, src, inv, alt > 0);
+# endif
+    if ( !to_bool(z == y) ) return __LINE__;
+#endif
 
 #ifdef mix
     touch(src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -8,9 +8,9 @@
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
-#include "sse-avx.h"
 #include "sse2-avx.h"
 #include "sse4-avx.h"
+#include "avx.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -44,7 +44,6 @@ static bool simd_check_avx(void)
 {
     return cpu_has_avx;
 }
-#define simd_check_sse_avx   simd_check_avx
 #define simd_check_sse2_avx  simd_check_avx
 #define simd_check_sse4_avx  simd_check_avx
 
@@ -122,12 +121,6 @@ static const struct {
     SIMD(SSE4 packed u32,        sse4,      16u4),
     SIMD(SSE4 packed s64,        sse4,      16i8),
     SIMD(SSE4 packed u64,        sse4,      16u8),
-    SIMD(SSE/AVX scalar single,  sse_avx,     f4),
-    SIMD(SSE/AVX packed single,  sse_avx,   16f4),
-    SIMD(SSE2/AVX scalar single, sse2_avx,    f4),
-    SIMD(SSE2/AVX packed single, sse2_avx,  16f4),
-    SIMD(SSE2/AVX scalar double, sse2_avx,    f8),
-    SIMD(SSE2/AVX packed double, sse2_avx,  16f8),
     SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
     SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
     SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
@@ -136,10 +129,6 @@ static const struct {
     SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
     SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
     SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
-    SIMD(SSE4/AVX scalar single, sse4_avx,    f4),
-    SIMD(SSE4/AVX packed single, sse4_avx,  16f4),
-    SIMD(SSE4/AVX scalar double, sse4_avx,    f8),
-    SIMD(SSE4/AVX packed double, sse4_avx,  16f8),
     SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
     SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
     SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
@@ -148,6 +137,12 @@ static const struct {
     SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
     SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
     SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
+    SIMD(AVX scalar single,      avx,         f4),
+    SIMD(AVX 128bit single,      avx,       16f4),
+    SIMD(AVX 256bit single,      avx,       32f4),
+    SIMD(AVX scalar double,      avx,         f8),
+    SIMD(AVX 128bit double,      avx,       16f8),
+    SIMD(AVX 256bit double,      avx,       32f8),
 #undef SIMD_
 #undef SIMD
 };
@@ -2834,6 +2829,81 @@ int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    /*
+     * The following "maskmov" tests are not only making sure the written data
+     * is correct, but verify (by placing operands on the mapping boundaries)
+     * that elements controlled by clear mask bits aren't being accessed.
+     */
+    printf("%-40s", "Testing vmaskmovps %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovps);
+
+        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqss %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovps, "vmaskmovps %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)res + MMAP_SZ - 4;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
+            goto fail;
+
+        asm volatile ( "vinsertps $0b00110111, %xmm2, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovps);
+        regs.edx = (unsigned long)(res - 3);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovps) ||
+             res[0] || memcmp(res + 1, res + 4, 12) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmaskmovpd %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmaskmovpd);
+
+        asm volatile ( "vxorpd %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vcmpeqsd %%xmm1, %%xmm1, %%xmm2\n\t"
+                       put_insn(vmaskmovpd, "vmaskmovpd %%xmm1, %%xmm2, (%0)")
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)res + MMAP_SZ - 8;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             res[MMAP_SZ / sizeof(*res) - 2] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
+            goto fail;
+
+        asm volatile ( "vmovddup %xmm2, %xmm2\n\t"
+                       "vmovsd %xmm1, %xmm2, %xmm2" );
+        memset(res, 0xdb, 32);
+        set_insn(vmaskmovpd);
+        regs.edx = (unsigned long)(res - 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmaskmovpd) ||
+             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing stmxcsr (%edx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -226,6 +226,12 @@ enum simd_opsize {
      */
     simd_scalar_fp,
 
+    /*
+     * 128 bits of integer or floating point data, with no further
+     * formatting information.
+     */
+    simd_128,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -361,14 +367,19 @@ static const struct {
     uint8_t vsib:1;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
     [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x2b] = { .simd_size = simd_packed_int },
+    [0x2c ... 0x2d] = { .simd_size = simd_other },
+    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -391,11 +402,15 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
+    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+    [0x18] = { .simd_size = simd_128 },
+    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -469,15 +484,18 @@ union vex {
     buf_ + 3; \
 })
 
+#define copy_VEX(ptr, vex) ({ \
+    if ( !mode_64bit() ) \
+        (vex).reg |= 8; \
+    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+    (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+    container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
+})
+
 #define copy_REX_VEX(ptr, rex, vex) do { \
     if ( (vex).opcx != vex_none ) \
-    { \
-        if ( !mode_64bit() ) \
-            vex.reg |= 8; \
-        (ptr)[0 - PFX_BYTES] = 0xc4; \
-        (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
-        (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
-    } \
+        copy_VEX(ptr, vex); \
     else \
     { \
         if ( (vex).pfx ) \
@@ -2933,6 +2951,10 @@ x86_decode(
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
+    case simd_128:
+        op_bytes = 16;
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -2959,6 +2981,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
+    unsigned int first_byte = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
@@ -7099,6 +7122,18 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
     case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
@@ -7132,6 +7167,10 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
     case X86EMUL_OPC_66(0x0f38, 0x17):     /* ptest xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
         if ( vex.opcx == vex_none )
@@ -7212,6 +7251,69 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd {x,y}mm,{x,y}mm,mem */
+    {
+        typeof(vex) *pvex;
+
+        generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
+        host_and_vcpu_must_have(avx);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /*
+         * While we can't reasonably provide fully correct behavior here
+         * (in particular, for writes, avoiding the memory read in anticipation
+         * of all elements in the range eventually being written), we can (and
+         * should) still limit the memory access to the smallest possible range
+         * (suppressing it altogether if all mask bits are clear), to provide
+         * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
+         * for that purpose.
+         */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        if ( !(b & 1) )
+            pvex->pfx = vex_none;
+        opc[0] = 0x50; /* vmovmskp{s,d} */
+        /* Use %rax as GPR destination and VEX.vvvv as source. */
+        pvex->r = 1;
+        pvex->b = !mode_64bit() || (vex.reg >> 3);
+        opc[1] = 0xc0 | (~vex.reg & 7);
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+        put_stub(stub);
+
+        if ( !ea.val )
+            goto complete_insn;
+
+        op_bytes = 4 << (b & 1);
+        first_byte = __builtin_ctz(ea.val);
+        ea.val >>= first_byte;
+        first_byte *= op_bytes;
+        op_bytes *= 32 - __builtin_clz(ea.val);
+
+        /*
+         * Even for the memory write variant a memory read is needed, unless
+         * all set mask bits are contiguous.
+         */
+        if ( ea.val & (ea.val + 1) )
+            d = (d & ~SrcMask) | SrcMem;
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert memory operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        opc[1] = modrm & 0x38;
+        fic.insn_bytes = PFX_BYTES + 2;
+
+        break;
+    }
+
     case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
@@ -7407,6 +7509,16 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x04): /* vpermilps $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,{x,y}mm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
     case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
@@ -7758,7 +7870,9 @@ x86_emulate(
             switch ( d & SrcMask )
             {
             case SrcMem:
-                rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+                rc = ops->read(ea.mem.seg, ea.mem.off + first_byte,
+                               (void *)mmvalp + first_byte, op_bytes,
+                               ctxt);
                 if ( rc != X86EMUL_OKAY )
                     goto done;
                 /* fall through */
@@ -7776,7 +7890,22 @@ x86_emulate(
             if ( (d & DstMask) == DstMem )
             {
                 fail_if(!ops->write); /* Check before running the stub. */
-                ASSERT(d & Mov);
+                if ( (d & SrcMask) == SrcMem )
+                    d |= Mov; /* Force memory write to occur below. */
+
+                switch ( ctxt->opcode )
+                {
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} */
+                    /* These have merge semantics; force write to occur. */
+                    d |= Mov;
+                    break;
+                default:
+                    ASSERT(d & Mov);
+                    break;
+                }
+
                 dst.type = OP_MEM;
                 dst.bytes = op_bytes;
                 dst.mem = ea.mem;
@@ -7824,8 +7953,9 @@ x86_emulate(
         else
         {
             fail_if(!ops->write);
-            rc = ops->write(dst.mem.seg, dst.mem.off,
-                            !state->simd_size ? &dst.val : (void *)mmvalp,
+            rc = ops->write(dst.mem.seg, dst.mem.off + first_byte,
+                            !state->simd_size ? &dst.val
+                                              : (void *)mmvalp + first_byte,
                             dst.bytes, ctxt);
             if ( sfence )
                 asm volatile ( "sfence" ::: "memory" );


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 02/17] x86emul: re-order cases of main switch statement
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
  2017-06-21 11:59 ` [PATCH 01/17] x86emul: support remaining AVX insns Jan Beulich
@ 2017-06-21 11:59 ` Jan Beulich
  2017-09-13 15:15   ` George Dunlap
  2017-06-21 12:00 ` [PATCH 03/17] x86emul: build SIMD tests with -Os Jan Beulich
                   ` (15 subsequent siblings)
  17 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 11:59 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 15330 bytes --]

Restore the intended numerical ordering, which has become "violated"
mostly by incremental additions, where moving around bigger chunks did
not seem advisable. One exception, though, at the very top of the
switch(): keeping the arithmetic ops together seems preferable over
entirely strict ordering.

Additionally move a few macro definitions before their first uses (the
placement is benign as long as those uses are themselves only macro
definitions, but that's going to change when those macros have helpers
broken out).
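
(A standalone illustration of the parenthetical claim, not part of the
patch: a name used inside a macro body need not be defined yet, because
the preprocessor only resolves it when the macro is expanded.)

#define TWICE(x) DOUBLE(x)     /* DOUBLE not defined yet - benign */
#define DOUBLE(x) ((x) + (x))  /* defined before TWICE is expanded */

int demo(void)
{
    return TWICE(21);          /* expands to ((21) + (21)) == 42 */
}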

No (intended) functional change.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -843,6 +843,27 @@ do{ asm volatile (
 #define __emulate_1op_8byte(_op, _dst, _eflags)
 #endif /* __i386__ */
 
+#define fail_if(p)                                      \
+do {                                                    \
+    rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY;     \
+    if ( rc ) goto done;                                \
+} while (0)
+
+static inline int mkec(uint8_t e, int32_t ec, ...)
+{
+    return (e < 32 && ((1u << e) & EXC_HAS_EC)) ? ec : X86_EVENT_NO_EC;
+}
+
+#define generate_exception_if(p, e, ec...)                                \
+({  if ( (p) ) {                                                          \
+        x86_emul_hw_exception(e, mkec(e, ##ec, 0), ctxt);                 \
+        rc = X86EMUL_EXCEPTION;                                           \
+        goto done;                                                        \
+    }                                                                     \
+})
+
+#define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
+
 #ifdef __XEN__
 # define invoke_stub(pre, post, constraints...) do {                    \
     union stub_exception_token res_ = { .raw = ~0 };                    \
@@ -911,27 +932,6 @@ do{ asm volatile (
 # define mode_64bit() false
 #endif
 
-#define fail_if(p)                                      \
-do {                                                    \
-    rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY;     \
-    if ( rc ) goto done;                                \
-} while (0)
-
-static inline int mkec(uint8_t e, int32_t ec, ...)
-{
-    return (e < 32 && ((1u << e) & EXC_HAS_EC)) ? ec : X86_EVENT_NO_EC;
-}
-
-#define generate_exception_if(p, e, ec...)                                \
-({  if ( (p) ) {                                                          \
-        x86_emul_hw_exception(e, mkec(e, ##ec, 0), ctxt);                 \
-        rc = X86EMUL_EXCEPTION;                                           \
-        goto done;                                                        \
-    }                                                                     \
-})
-
-#define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
-
 /*
  * Given byte has even parity (even number of 1s)? SDM Vol. 1 Sec. 3.4.3.1,
  * "Status Flags": EFLAGS.PF reflects parity of least-sig. byte of result only.
@@ -3596,6 +3596,11 @@ x86_emulate(
             dst.bytes = 2;
         break;
 
+    case 0x8d: /* lea */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        dst.val = ea.mem.off;
+        break;
+
     case 0x8e: /* mov r/m,Sreg */
         seg = modrm_reg & 7; /* REX.R is ignored. */
         generate_exception_if(!is_x86_user_segment(seg) ||
@@ -3607,11 +3612,6 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
-    case 0x8d: /* lea */
-        generate_exception_if(ea.type != OP_MEM, EXC_UD);
-        dst.val = ea.mem.off;
-        break;
-
     case 0x8f: /* pop (sole member of Grp1a) */
         generate_exception_if((modrm_reg & 7) != 0, EXC_UD);
         /* 64-bit mode: POP defaults to a 64-bit operand. */
@@ -5746,12 +5746,6 @@ x86_emulate(
         _regs.r(ax) = (uint32_t)msr_val;
         break;
 
-    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
-        vcpu_must_have(cmov);
-        if ( test_cc(b, _regs.eflags) )
-            dst.val = src.val;
-        break;
-
     case X86EMUL_OPC(0x0f, 0x34): /* sysenter */
         vcpu_must_have(sep);
         generate_exception_if(mode_ring0(), EXC_GP, 0);
@@ -5834,6 +5828,12 @@ x86_emulate(
         singlestep = _regs.eflags & X86_EFLAGS_TF;
         break;
 
+    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
+        vcpu_must_have(cmov);
+        if ( test_cc(b, _regs.eflags) )
+            dst.val = src.val;
+        break;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
@@ -6050,10 +6050,6 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx, &fic);
         goto simd_0f_common;
 
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
-        generate_exception_if(vex.l, EXC_UD);
-        goto simd_0f_avx;
-
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -6351,12 +6347,6 @@ x86_emulate(
         op_bytes = 8;
         goto simd_0f_xmm;
 
-    case X86EMUL_OPC_F3(0x0f, 0x7e):     /* movq xmm/m64,xmm */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
-        generate_exception_if(vex.l, EXC_UD);
-        op_bytes = 8;
-        goto simd_0f_int;
-
     case X86EMUL_OPC_F2(0x0f, 0xf0):     /* lddqu m128,xmm */
     case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -6376,6 +6366,12 @@ x86_emulate(
         op_bytes = 16 << vex.l;
         goto simd_0f_sse3_avx;
 
+    case X86EMUL_OPC_F3(0x0f, 0x7e):     /* movq xmm/m64,xmm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        op_bytes = 8;
+        goto simd_0f_int;
+
     case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
@@ -7134,39 +7130,6 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_avx;
 
-    case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
-        op_bytes = 16 >> pmov_convert_delta[b & 7];
-        /* fall through */
-    case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
-        host_and_vcpu_must_have(sse4_1);
-        goto simd_0f38_common;
-
     case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
         generate_exception_if(vex.w, EXC_UD);
@@ -7220,6 +7183,39 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+    case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
+        op_bytes = 16 >> pmov_convert_delta[b & 7];
+        /* fall through */
+    case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        goto simd_0f38_common;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -7318,16 +7314,6 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
 
-    case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
-    case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
-    case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
-    case X86EMUL_OPC(0x0f38, 0xcb):     /* sha256rnds2 XMM0,xmm/m128,xmm */
-    case X86EMUL_OPC(0x0f38, 0xcc):     /* sha256msg1 xmm/m128,xmm */
-    case X86EMUL_OPC(0x0f38, 0xcd):     /* sha256msg2 xmm/m128,xmm */
-        host_and_vcpu_must_have(sha);
-        op_bytes = 16;
-        goto simd_0f38_common;
-
     case X86EMUL_OPC_66(0x0f38, 0xdb):     /* aesimc xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f38, 0xdc):     /* aesenc xmm/m128,xmm,xmm */
@@ -7341,9 +7327,21 @@ x86_emulate(
         host_and_vcpu_must_have(aesni);
         if ( vex.opcx == vex_none )
             goto simd_0f38_common;
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_avx;
 
+    case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
+    case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
+    case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
+    case X86EMUL_OPC(0x0f38, 0xcb):     /* sha256rnds2 XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC(0x0f38, 0xcc):     /* sha256msg1 xmm/m128,xmm */
+    case X86EMUL_OPC(0x0f38, 0xcd):     /* sha256msg2 xmm/m128,xmm */
+        host_and_vcpu_must_have(sha);
+        op_bytes = 16;
+        goto simd_0f38_common;
+
     case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
     case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
         vcpu_must_have(movbe);
@@ -7519,6 +7517,19 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        goto simd_0f3a_common;
+
     case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
@@ -7547,19 +7558,6 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 4;
         break;
 
-    case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
-    case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
-        host_and_vcpu_must_have(sse4_1);
-        goto simd_0f3a_common;
-
     case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
     case X86EMUL_OPC_66(0x0f3a, 0x15): /* pextrw $imm8,xmm,r/m */
     case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */




^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 03/17] x86emul: build SIMD tests with -Os
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
  2017-06-21 11:59 ` [PATCH 01/17] x86emul: support remaining AVX insns Jan Beulich
  2017-06-21 11:59 ` [PATCH 02/17] x86emul: re-order cases of main switch statement Jan Beulich
@ 2017-06-21 12:00 ` Jan Beulich
  2017-09-13 15:19   ` George Dunlap
  2017-06-21 12:01 ` [PATCH 04/17] x86emul: support F16C insns Jan Beulich
                   ` (14 subsequent siblings)
  17 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:00 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 1861 bytes --]

While putting together subsequent patches I've noticed that, in
combination with the touch() macro, using -Os further increases the
chances of the compiler using memory operands for the instructions we
actually care to test.
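
(Illustration only, not part of the patch: the touch() macro in
question is the test harness' one, quoted below. Its "+m" constraint
tells the compiler the variable is read and modified in memory, so it
has to stay addressable across the asm; under -Os the compiler then
tends to operate on it straight from memory.)

#define touch(var) asm volatile ( "" : "+m" (var) )

int demo(int x)
{
    int v = x * 3;
    touch(v);      /* v must be held in a memory slot at this point */
    return v + 1;  /* with -Os, often an add using a memory operand */
}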

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -45,17 +45,17 @@ define simd-defs
 $(1)-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
 	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
 $(1)-avx-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
+	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))





^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 04/17] x86emul: support F16C insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (2 preceding siblings ...)
  2017-06-21 12:00 ` [PATCH 03/17] x86emul: build SIMD tests with -Os Jan Beulich
@ 2017-06-21 12:01 ` Jan Beulich
  2017-09-13 17:10   ` George Dunlap
  2017-06-21 12:01 ` [PATCH 05/17] x86emul: support FMA4 insns Jan Beulich
                   ` (13 subsequent siblings)
  17 siblings, 1 reply; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:01 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 6751 bytes --]

Note that this avoids emulating the behavior of VCVTPS2PH found on at
least some Intel CPUs, which update MXCSR even when the memory write
faults.
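
(A hedged sketch, not the patch's code, of the latch-and-restore idea
the hunks below implement; op() and commit() are stand-ins for the
stub invocation and the ops->write callback.)

#include <stdint.h>

static inline uint32_t read_mxcsr(void)
{
    uint32_t v;
    asm volatile ( "stmxcsr %0" : "=m" (v) );
    return v;
}

static inline void write_mxcsr(uint32_t v)
{
    asm volatile ( "ldmxcsr %0" :: "m" (v) );
}

/* Run op(); if committing its result fails, hide the MXCSR update by
 * restoring the value latched beforehand. */
static int run_with_rollback(void (*op)(void), int (*commit)(void))
{
    uint32_t saved = read_mxcsr();

    op();
    if ( !commit() )
    {
        write_mxcsr(saved);
        return -1;
    }
    return 0;
}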

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3028,6 +3028,47 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
+    printf("%-40s", "Testing vcvtph2ps (%ecx),%ymm1...");
+    if ( stack_exec && cpu_has_f16c )
+    {
+        decl_insn(vcvtph2ps);
+        decl_insn(vcvtps2ph);
+
+        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n"
+                       put_insn(vcvtph2ps, "vcvtph2ps (%0), %%ymm1")
+                       :: "c" (NULL) );
+
+        set_insn(vcvtph2ps);
+        res[1] = 0x40003c00; /* (1.0, 2.0) */
+        res[2] = 0x44004200; /* (3.0, 4.0) */
+        res[3] = 0x3400b800; /* (-.5, .25) */
+        res[4] = 0xbc000000; /* (0.0, -1.) */
+        memset(res + 5, 0xff, 16);
+        regs.ecx = (unsigned long)(res + 1);
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "vmovups %%ymm1, %0" : "=m" (res[16]) );
+        if ( rc != X86EMUL_OKAY || !check_eip(vcvtph2ps) )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vcvtps2ph $0,%ymm1,(%edx)...");
+        asm volatile ( "vmovups %0, %%ymm1\n"
+                       put_insn(vcvtps2ph, "vcvtps2ph $0, %%ymm1, (%1)")
+                       :: "m" (res[16]), "d" (NULL) );
+
+        set_insn(vcvtps2ph);
+        memset(res + 7, 0, 32);
+        regs.edx = (unsigned long)(res + 7);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vcvtps2ph) ||
+             memcmp(res + 1, res + 7, 16) ||
+             res[11] || res[12] || res[13] || res[14] )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -127,6 +127,14 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 28)) != 0; \
 })
 
+#define cpu_has_f16c ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    (res.c & (1U << 29)) != 0; \
+})
+
 #define cpu_has_avx2 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -369,6 +369,7 @@ static const struct {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
+    [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
@@ -411,6 +412,7 @@ static const struct {
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
     [0x18] = { .simd_size = simd_128 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -1601,6 +1603,7 @@ static bool vcpu_has(
 #define vcpu_has_popcnt()      vcpu_has(         1, ECX, 23, ctxt, ops)
 #define vcpu_has_aesni()       vcpu_has(         1, ECX, 25, ctxt, ops)
 #define vcpu_has_avx()         vcpu_has(         1, ECX, 28, ctxt, ops)
+#define vcpu_has_f16c()        vcpu_has(         1, ECX, 29, ctxt, ops)
 #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
 #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
@@ -7216,6 +7219,12 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        host_and_vcpu_must_have(f16c);
+        op_bytes = 8 << vex.l;
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -7607,6 +7616,50 @@ x86_emulate(
         opc = init_prefixes(stub);
         goto pextr;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
+    {
+        uint32_t mxcsr;
+
+        generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
+        host_and_vcpu_must_have(f16c);
+        fail_if(!ops->write);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* Convert memory operand to (%rAX). */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        opc[2] = imm1;
+        fic.insn_bytes = PFX_BYTES + 3;
+        opc[3] = 0xc3;
+
+        copy_VEX(opc, vex);
+        /* Latch MXCSR - we may need to restore it below. */
+        invoke_stub("stmxcsr %[mxcsr]", "",
+                    "=m" (*mmvalp), "+m" (fic.exn_raised), [mxcsr] "=m" (mxcsr)
+                    : "a" (mmvalp));
+
+        put_stub(stub);
+        check_xmm_exn(&fic);
+
+        if ( ea.type == OP_MEM )
+        {
+            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );
+                goto done;
+            }
+        }
+
+        state->simd_size = simd_none;
+        break;
+    }
+
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -60,6 +60,7 @@
 #define cpu_has_aesni           boot_cpu_has(X86_FEATURE_AESNI)
 #define cpu_has_xsave           boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_avx             boot_cpu_has(X86_FEATURE_AVX)
+#define cpu_has_f16c            boot_cpu_has(X86_FEATURE_F16C)
 #define cpu_has_rdrand          boot_cpu_has(X86_FEATURE_RDRAND)
 #define cpu_has_hypervisor      boot_cpu_has(X86_FEATURE_HYPERVISOR)
 




^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 05/17] x86emul: support FMA4 insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (3 preceding siblings ...)
  2017-06-21 12:01 ` [PATCH 04/17] x86emul: support F16C insns Jan Beulich
@ 2017-06-21 12:01 ` Jan Beulich
  2017-06-21 12:02 ` [PATCH 06/17] x86emul: support FMA insns Jan Beulich
                   ` (12 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:01 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 17643 bytes --]

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -226,6 +226,7 @@
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
+tools/tests/x86_emulator/fma*.[ch]
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -12,7 +12,8 @@ run: $(TARGET)
 	./$(TARGET)
 
 SIMD := sse sse2 sse4 avx
-TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
+FMA := fma4
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -29,6 +30,9 @@ sse4-flts := $(sse2-flts)
 avx-vecs := 16 32
 avx-ints :=
 avx-flts := 4 8
+fma4-vecs := $(avx-vecs)
+fma4-ints :=
+fma4-flts := $(avx-flts)
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
@@ -58,7 +62,7 @@ $(1)-avx-cflags := \
 	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
-$(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
+$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -77,6 +81,11 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 $(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))):
 	ln -sf simd.c $@
 
+$(addsuffix .c,$(FMA)):
+	ln -sf simd-fma.c $@
+
+$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h
+
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -1,71 +1,6 @@
-#include <stdbool.h>
+#include "simd.h"
 
-asm (
-    "\t.text\n"
-    "\t.globl _start\n"
-    "_start:\n"
-#if defined(__i386__) && VEC_SIZE == 16
-    "\tpush %ebp\n"
-    "\tmov %esp,%ebp\n"
-    "\tand $~0xf,%esp\n"
-    "\tcall simd_test\n"
-    "\tleave\n"
-    "\tret"
-#else
-    "\tjmp simd_test"
-#endif
-    );
-
-typedef
-#if defined(INT_SIZE)
-# define ELEM_SIZE INT_SIZE
-signed int
-# if INT_SIZE == 1
-#  define MODE QI
-# elif INT_SIZE == 2
-#  define MODE HI
-# elif INT_SIZE == 4
-#  define MODE SI
-# elif INT_SIZE == 8
-#  define MODE DI
-# endif
-#elif defined(UINT_SIZE)
-# define ELEM_SIZE UINT_SIZE
-unsigned int
-# if UINT_SIZE == 1
-#  define MODE QI
-# elif UINT_SIZE == 2
-#  define MODE HI
-# elif UINT_SIZE == 4
-#  define MODE SI
-# elif UINT_SIZE == 8
-#  define MODE DI
-# endif
-#elif defined(FLOAT_SIZE)
-float
-# define ELEM_SIZE FLOAT_SIZE
-# if FLOAT_SIZE == 4
-#  define MODE SF
-# elif FLOAT_SIZE == 8
-#  define MODE DF
-# endif
-#endif
-#ifndef VEC_SIZE
-# define VEC_SIZE ELEM_SIZE
-#endif
-__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
-
-#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
-
-typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
-
-/* Various builtins want plain char / int / long long vector types ... */
-typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
-typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
-typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
-#if VEC_SIZE >= 8
-typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
-#endif
+ENTRY(simd_test);
 
 #if VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
@@ -418,13 +353,6 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
-/*
- * Suppress value propagation by the compiler, preventing unwanted
- * optimization. This at once makes the compiler use memory operands
- * more often, which for our purposes is the more interesting case.
- */
-#define touch(var) asm volatile ( "" : "+m" (var) )
-
 int simd_test(void)
 {
     unsigned int i, j;
--- /dev/null
+++ b/tools/tests/x86_emulator/simd.h
@@ -0,0 +1,78 @@
+#include <stdbool.h>
+
+#if defined(__i386__) && VEC_SIZE == 16
+# define ENTRY(name) \
+asm ( "\t.text\n" \
+      "\t.globl _start\n" \
+      "_start:\n" \
+      "\tpush %ebp\n" \
+      "\tmov %esp,%ebp\n" \
+      "\tand $~0xf,%esp\n" \
+      "\tcall " #name "\n" \
+      "\tleave\n" \
+      "\tret" )
+#else
+# define ENTRY(name) \
+asm ( "\t.text\n" \
+      "\t.globl _start\n" \
+      "_start:\n" \
+      "\tjmp " #name )
+#endif
+
+typedef
+#if defined(INT_SIZE)
+# define ELEM_SIZE INT_SIZE
+signed int
+# if INT_SIZE == 1
+#  define MODE QI
+# elif INT_SIZE == 2
+#  define MODE HI
+# elif INT_SIZE == 4
+#  define MODE SI
+# elif INT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(UINT_SIZE)
+# define ELEM_SIZE UINT_SIZE
+unsigned int
+# if UINT_SIZE == 1
+#  define MODE QI
+# elif UINT_SIZE == 2
+#  define MODE HI
+# elif UINT_SIZE == 4
+#  define MODE SI
+# elif UINT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(FLOAT_SIZE)
+float
+# define ELEM_SIZE FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define MODE SF
+# elif FLOAT_SIZE == 8
+#  define MODE DF
+# endif
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE ELEM_SIZE
+#endif
+__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
+
+#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
+
+typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
+
+/* Various builtins want plain char / int / long long vector types ... */
+typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
+typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+#if VEC_SIZE >= 8
+typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+#endif
+
+/*
+ * Suppress value propagation by the compiler, preventing unwanted
+ * optimization. This at once makes the compiler use memory operands
+ * more often, which for our purposes is the more interesting case.
+ */
+#define touch(var) asm volatile ( "" : "+m" (var) )
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -0,0 +1,121 @@
+#include "simd.h"
+
+ENTRY(fma_test);
+
+#if VEC_SIZE < 16
+# define to_bool(cmp) (!~(cmp)[0])
+#elif VEC_SIZE == 16
+# if FLOAT_SIZE == 4
+#  define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+# elif FLOAT_SIZE == 8
+#  define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+# endif
+#elif VEC_SIZE == 32
+# if FLOAT_SIZE == 4
+#  define to_bool(cmp) __builtin_ia32_vtestcps256(cmp, (vec_t){} == 0)
+# elif FLOAT_SIZE == 8
+#  define to_bool(cmp) __builtin_ia32_vtestcpd256(cmp, (vec_t){} == 0)
+# endif
+#endif
+
+#if VEC_SIZE == 16
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z)
+#  endif
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z)
+#  endif
+# endif
+#elif VEC_SIZE == 32
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z)
+#  endif
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z)
+#  endif
+# endif
+#endif
+
+int fma_test(void)
+{
+    unsigned int i;
+    vec_t x, y, z, src, inv, one;
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i + 1;
+        inv[i] = ELEM_COUNT - i;
+        one[i] = 1;
+    }
+
+    x = (src + one) * inv;
+    y = (src - one) * inv;
+    touch(src);
+    z = inv * src + inv;
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(src);
+    z = -inv * src - inv;
+    if ( !to_bool(-x == z) ) return __LINE__;
+
+    touch(src);
+    z = inv * src - inv;
+    if ( !to_bool(y == z) ) return __LINE__;
+
+    touch(src);
+    z = -inv * src + inv;
+    if ( !to_bool(-y == z) ) return __LINE__;
+    touch(src);
+
+    x = src + inv;
+    y = src - inv;
+    touch(inv);
+    z = src * one + inv;
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(inv);
+    z = -src * one - inv;
+    if ( !to_bool(-x == z) ) return __LINE__;
+
+    touch(inv);
+    z = src * one - inv;
+    if ( !to_bool(y == z) ) return __LINE__;
+
+    touch(inv);
+    z = -src * one + inv;
+    if ( !to_bool(-y == z) ) return __LINE__;
+    touch(inv);
+
+#if defined(addsub) && defined(fmaddsub)
+    x = addsub(src * inv, one);
+    y = addsub(src * inv, -one);
+    touch(one);
+    z = fmaddsub(src, inv, one);
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(one);
+    z = fmaddsub(src, inv, -one);
+    if ( !to_bool(y == z) ) return __LINE__;
+    touch(one);
+
+    x = addsub(src * inv, one);
+    touch(inv);
+    z = fmaddsub(src, inv, one);
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(inv);
+    z = fmaddsub(src, inv, -one);
+    if ( !to_bool(y == z) ) return __LINE__;
+    touch(inv);
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,6 +11,7 @@
 #include "sse2-avx.h"
 #include "sse4-avx.h"
 #include "avx.h"
+#include "fma4.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -47,6 +48,11 @@ static bool simd_check_avx(void)
 #define simd_check_sse2_avx  simd_check_avx
 #define simd_check_sse4_avx  simd_check_avx
 
+static bool simd_check_fma4(void)
+{
+    return cpu_has_fma4;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -143,6 +149,12 @@ static const struct {
     SIMD(AVX scalar double,      avx,         f8),
     SIMD(AVX 128bit double,      avx,       16f8),
     SIMD(AVX 256bit double,      avx,       32f8),
+    SIMD(FMA4 scalar single,     fma4,        f4),
+    SIMD(FMA4 128bit single,     fma4,      16f4),
+    SIMD(FMA4 256bit single,     fma4,      32f4),
+    SIMD(FMA4 scalar double,     fma4,        f8),
+    SIMD(FMA4 128bit double,     fma4,      16f8),
+    SIMD(FMA4 256bit double,     fma4,      32f8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -164,6 +164,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 6)) != 0; \
 })
 
+#define cpu_has_fma4 ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.c & (1U << 16)) != 0; \
+})
+
 #define cpu_has_tbm ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(0x80000001, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -421,7 +421,16 @@ static const struct {
     [0x44] = { .simd_size = simd_packed_int },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x6a ... 0x6b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x6e ... 0x6f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x78 ... 0x79] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x7a ... 0x7b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x7e ... 0x7f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
     [0xcc] = { .simd_size = simd_other },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = {},
@@ -1612,6 +1621,7 @@ static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
 #define vcpu_has_hle()         vcpu_has(         7, EBX,  4, ctxt, ops)
@@ -6155,6 +6165,7 @@ x86_emulate(
     simd_0f_imm8_avx:
                 host_and_vcpu_must_have(avx);
             }
+    simd_0f_imm8_ymm:
             get_fpu(X86EMUL_FPU_ymm, &fic);
         }
         else if ( vex.pfx )
@@ -7710,6 +7721,49 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5c): /* vfmaddsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5d): /* vfmaddsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5e): /* vfmsubaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5f): /* vfmsubaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x68): /* vfmaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x69): /* vfmaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6a): /* vfmaddss xmm,xmm/m32,xmm,xmm */
+                                           /* vfmaddss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6b): /* vfmaddsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfmaddsd xmm/m64,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6c): /* vfmsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6d): /* vfmsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6e): /* vfmsubss xmm,xmm/m32,xmm,xmm */
+                                           /* vfmsubss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6f): /* vfmsubsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfmsubsd xmm/m64,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x78): /* vfnmaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x79): /* vfnmaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7a): /* vfnmaddss xmm,xmm/m32,xmm,xmm */
+                                           /* vfnmaddss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7b): /* vfnmaddsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfnmaddsd xmm/m64,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7c): /* vfnmsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7d): /* vfnmsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7e): /* vfnmsubss xmm,xmm/m32,xmm,xmm */
+                                           /* vfnmsubss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7f): /* vfnmsubsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfnmsubsd xmm/m64,xmm,xmm,xmm */
+        host_and_vcpu_must_have(fma4);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x60):     /* pcmpestrm $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x60): /* vpcmpestrm $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x61):     /* pcmpestri $imm8,xmm/m128,xmm */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
+#define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)
 
 /* CPUID level 0x0000000D:1.eax */



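For completeness, a sketch of the 4-operand semantics the new fma4
testcase ends up exercising; the harness only runs it when CPUID
reports FMA4 and xgetbv(0) confirms XMM/YMM state enabled (bits 1 and
2 of XCR0, hence the "& 6" check above). The demo below is
hypothetical and not part of the patch; it merely illustrates that the
destination of e.g. vfmaddps is distinct from all three sources:

/* fma4-demo.c (hypothetical): build with gcc -mfma4 -O2 fma4-demo.c */
#include <stdio.h>

typedef float v4sf __attribute__((vector_size(16)));

int main(void)
{
    v4sf a = { 1, 2, 3, 4 };
    v4sf b = { 5, 6, 7, 8 };
    v4sf c = { 9, 10, 11, 12 };
    /* vfmaddps: d = a * b + c, leaving a, b, and c untouched. */
    v4sf d = __builtin_ia32_vfmaddps(a, b, c);
    unsigned int i;

    for ( i = 0; i < 4; ++i )
        if ( d[i] != a[i] * b[i] + c[i] )
            return 1;
    printf("%f %f %f %f\n", (double)d[0], (double)d[1],
           (double)d[2], (double)d[3]);
    return 0;
}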

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 06/17] x86emul: support FMA insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (4 preceding siblings ...)
  2017-06-21 12:01 ` [PATCH 05/17] x86emul: support FMA4 insns Jan Beulich
@ 2017-06-21 12:02 ` Jan Beulich
  2017-06-21 12:02 ` [PATCH 07/17] x86emul: support most remaining AVX2 insns Jan Beulich
                   ` (11 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:02 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 8414 bytes --]

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -12,7 +12,7 @@ run: $(TARGET)
 	./$(TARGET)
 
 SIMD := sse sse2 sse4 avx
-FMA := fma4
+FMA := fma4 fma
 TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
 
 blowfish-cflags := ""
@@ -33,6 +33,9 @@ avx-flts := 4 8
 fma4-vecs := $(avx-vecs)
 fma4-ints :=
 fma4-flts := $(avx-flts)
+fma-vecs := $(avx-vecs)
+fma-ints :=
+fma-flts := $(avx-flts)
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -21,24 +21,24 @@ ENTRY(fma_test);
 #if VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z)
 #  endif
 # elif FLOAT_SIZE == 8
 #  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z)
 #  endif
 # endif
 #elif VEC_SIZE == 32
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z)
 #  endif
 # elif FLOAT_SIZE == 8
 #  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z)
 #  endif
 # endif
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,6 +12,7 @@
 #include "sse4-avx.h"
 #include "avx.h"
 #include "fma4.h"
+#include "fma.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -53,6 +54,11 @@ static bool simd_check_fma4(void)
     return cpu_has_fma4;
 }
 
+static bool simd_check_fma(void)
+{
+    return cpu_has_fma;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -155,6 +161,12 @@ static const struct {
     SIMD(FMA4 scalar double,     fma4,        f8),
     SIMD(FMA4 128bit double,     fma4,      16f8),
     SIMD(FMA4 256bit double,     fma4,      32f8),
+    SIMD(FMA scalar single,      fma,         f4),
+    SIMD(FMA 128bit single,      fma,       16f4),
+    SIMD(FMA 256bit single,      fma,       32f4),
+    SIMD(FMA scalar double,      fma,         f8),
+    SIMD(FMA 128bit double,      fma,       16f8),
+    SIMD(FMA 256bit double,      fma,       32f8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -94,6 +94,14 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 0)) != 0; \
 })
 
+#define cpu_has_fma ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    (res.c & (1U << 12)) != 0; \
+})
+
 #define cpu_has_sse4_1 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -385,6 +385,9 @@ static const struct {
     [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
+    [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
+    [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
     [0xc8 ... 0xcd] = { .simd_size = simd_other },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -1605,6 +1608,7 @@ static bool vcpu_has(
 #define vcpu_has_sse3()        vcpu_has(         1, ECX,  0, ctxt, ops)
 #define vcpu_has_pclmulqdq()   vcpu_has(         1, ECX,  1, ctxt, ops)
 #define vcpu_has_ssse3()       vcpu_has(         1, ECX,  9, ctxt, ops)
+#define vcpu_has_fma()         vcpu_has(         1, ECX, 12, ctxt, ops)
 #define vcpu_has_cx16()        vcpu_has(         1, ECX, 13, ctxt, ops)
 #define vcpu_has_sse4_1()      vcpu_has(         1, ECX, 19, ctxt, ops)
 #define vcpu_has_sse4_2()      vcpu_has(         1, ECX, 20, ctxt, ops)
@@ -7352,6 +7356,39 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm */
+        host_and_vcpu_must_have(fma);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -50,6 +50,7 @@
 #define cpu_has_vmx             boot_cpu_has(X86_FEATURE_VMX)
 #define cpu_has_eist            boot_cpu_has(X86_FEATURE_EIST)
 #define cpu_has_ssse3           boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_fma             boot_cpu_has(X86_FEATURE_FMA)
 #define cpu_has_cx16            boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_pdcm            boot_cpu_has(X86_FEATURE_PDCM)
 #define cpu_has_pcid            boot_cpu_has(X86_FEATURE_PCID)



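For reference, the 132/213/231 infixes encode which operands feed the
multiply and which one gets added, with operand 1 always being the
destination: vfmadd132 computes op1 = op1 * op3 + op2, vfmadd213
computes op1 = op2 * op1 + op3, and vfmadd231 computes
op1 = op2 * op3 + op1. Compilers choose among the three forms purely
by register allocation, which is why the emulator has to cover them
all. A minimal hypothetical sketch, not part of the patch (assumes a
GCC with FMA support):

/* fma-demo.c (hypothetical): build with gcc -mfma -O2 fma-demo.c */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d a = _mm256_set1_pd(2.0);
    __m256d b = _mm256_set1_pd(3.0);
    __m256d c = _mm256_set1_pd(4.0);
    /* a * b + c; whether 132, 213, or 231 gets emitted is up to GCC. */
    __m256d r = _mm256_fmadd_pd(a, b, c);
    double buf[4];

    _mm256_storeu_pd(buf, r);
    printf("%f %f %f %f\n", buf[0], buf[1], buf[2], buf[3]); /* all 10 */
    return 0;
}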

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 07/17] x86emul: support most remaining AVX2 insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (5 preceding siblings ...)
  2017-06-21 12:02 ` [PATCH 06/17] x86emul: support FMA insns Jan Beulich
@ 2017-06-21 12:02 ` Jan Beulich
  2017-06-21 12:03 ` [PATCH 08/17] x86emul: fold/eliminate some local variables Jan Beulich
                   ` (10 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:02 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 28665 bytes --]

I.e. those which aren't equivalents of SSEn ones, with the exception
of the various gather operations.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

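Note: the simd.c additions below also rely on AVX2's per-element
variable shift counts; GCC lowers generic vector shifts by a vector
(the z << sh and y >> sh expressions further down) to vpsllvd/vpsrlvd
for 32-bit elements. A hypothetical standalone illustration, not part
of the patch:

/* avx2-shift-demo.c (hypothetical): build with gcc -mavx2 -O2 */
#include <stdio.h>

typedef unsigned int v8si __attribute__((vector_size(32)));

int main(void)
{
    v8si z = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8si sh = { 0, 1, 2, 3, 4, 5, 6, 7 };
    v8si y = z << sh; /* per-element counts: vpsllvd */
    v8si x = y >> sh; /* vpsrlvd; shifting back recovers z */
    unsigned int i;

    for ( i = 0; i < 8; ++i )
        if ( x[i] != z[i] )
            return 1;
    puts("ok");
    return 0;
}
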
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,9 +11,9 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx
+SIMD := sse sse2 sse4 avx avx2
 FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
+TESTCASES := blowfish $(SIMD) $(FMA)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -36,13 +36,9 @@ fma4-flts := $(avx-flts)
 fma-vecs := $(avx-vecs)
 fma-ints :=
 fma-flts := $(avx-flts)
-
-# When converting SSE to AVX, have the compiler avoid XMM0 to widen
-# coverage of the VEX.vvvv checks in the emulator. We must not do this,
-# however, for SSE4.1 and later, as there are instructions with XMM0 as
-# an implicit operand.
-sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse4 := -Wa,-msse2avx
+avx2-vecs := $(avx-vecs)
+avx2-ints := 1 2 4 8
+avx2-flts := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -58,11 +54,6 @@ $(1)-cflags := \
 	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
 	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
-$(1)-avx-cflags := \
-	$(foreach vec,$($(1)-vecs), \
-	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
@@ -81,13 +72,13 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	)
 	mv $@.new $@
 
-$(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))):
+$(addsuffix .c,$(SIMD)):
 	ln -sf simd.c $@
 
 $(addsuffix .c,$(FMA)):
 	ln -sf simd-fma.c $@
 
-$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h
+$(addsuffix .o,$(SIMD) $(FMA)): simd.h
 
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -23,7 +23,9 @@ ENTRY(simd_test);
 #  endif
 # endif
 #elif VEC_SIZE == 32
-# if defined(__AVX__) && ELEM_SIZE == 4
+# if defined(__AVX2__)
+#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0)
+# elif defined(__AVX__) && ELEM_SIZE == 4
 #  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
 # elif defined(__AVX__) && ELEM_SIZE == 8
 #  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
@@ -80,10 +82,14 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
     __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
 })
-#  define swap2(x) ({ \
-    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
-    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+#  ifdef __AVX2__
+#   define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1)
+#  else
+#   define swap2(x) ({ \
+        vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+        __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
 })
+#  endif
 # elif VEC_SIZE == 16
 #  ifdef __AVX__
 #   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
@@ -128,6 +134,9 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
     __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
 })
+#  ifdef __AVX2__
+#   define swap2(x) __builtin_ia32_permdf256(x, 0b00011011)
+#  endif
 # elif VEC_SIZE == 16
 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
@@ -184,6 +193,104 @@ static inline bool _to_bool(byte_vec_t b
     __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
     __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
 })
+#elif VEC_SIZE == 32 && defined(__AVX2__)
+# define swap_lanes(x, y, func, type) ({ \
+    long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \
+    type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \
+    t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \
+    t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \
+    func(t1_, t2_); \
+})
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y)))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 8))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y)))
+#  define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t))
+#  define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t))
+#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 16))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+#  define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y)))
+#  define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t))
+#  define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t))
+#  define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 32))
+#  define select(d, x, y, m) ({ \
+    vsi_t m_ = (vsi_t)(m); \
+    *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x),  m_); \
+    __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \
+})
+#  define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 64))
+#  define select(d, x, y, m) ({ \
+    vdi_t m_ = (vdi_t)(m); \
+    *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x),  m_); \
+    __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \
+})
+#  define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011))
+#  define swap2(x) ({ \
+    vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \
+    (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \
+})
+# endif
+# if INT_SIZE == 1
+#  define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x)))
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x)))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x)))
+#  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x)))
+# elif UINT_SIZE == 1
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x)))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x)))
+#  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x)))
+# elif INT_SIZE == 2
+#  define abs(x) __builtin_ia32_pabsw256(x)
+#  define max(x, y) __builtin_ia32_pmaxsw256(x, y)
+#  define min(x, y) __builtin_ia32_pminsw256(x, y)
+#  define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y)
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x))
+# elif UINT_SIZE == 2
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y)))
+#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x)))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x)))
+# elif INT_SIZE == 4
+#  define abs(x) __builtin_ia32_pabsd256(x)
+#  define max(x, y) __builtin_ia32_pmaxsd256(x, y)
+#  define min(x, y) __builtin_ia32_pminsd256(x, y)
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x))
+# elif UINT_SIZE == 4
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y)))
+#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x)))
+# elif INT_SIZE == 8
+#  define broadcast(x) ({ \
+    long long s_ = (x); \
+    long long __attribute__((vector_size(16))) t_; \
+    vec_t d_; \
+    asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \
+    asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \
+    d_; \
+})
+# elif UINT_SIZE == 8
+#  define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSE3__)
 # if FLOAT_SIZE == 4
@@ -207,25 +314,37 @@ static inline bool _to_bool(byte_vec_t b
 #  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
 #  define dup_hi(x) __builtin_ia32_movshdup256(x)
 #  define dup_lo(x) __builtin_ia32_movsldup256(x)
-#  define hadd(x, y) ({ \
+#  ifdef __AVX2__
+#   define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \
+                                                  (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+#   define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \
+                                                  (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+#  else
+#   define hadd(x, y) ({ \
         vec_t t_ = __builtin_ia32_haddps256(x, y); \
         (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
 })
-#  define hsub(x, y) ({ \
+#   define hsub(x, y) ({ \
         vec_t t_ = __builtin_ia32_hsubps256(x, y); \
         (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
 })
+#  endif
 # elif FLOAT_SIZE == 8
 #  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
 #  define dup_lo(x) __builtin_ia32_movddup256(x)
-#  define hadd(x, y) ({ \
+#  ifdef __AVX2__
+#   define hadd(x, y) __builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000)
+#   define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000)
+#  else
+#   define hadd(x, y) ({ \
         vec_t t_ = __builtin_ia32_haddpd256(x, y); \
         (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
 })
-#  define hsub(x, y) ({ \
+#   define hsub(x, y) ({ \
         vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
         (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
 })
+#  endif
 # endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__)
@@ -546,7 +665,7 @@ int simd_test(void)
     z *= alt;
 #  endif
     /*
-     * Zap elements for which the shift count is negative (and the hence the
+     * Zap elements for which the shift count is zero (and hence the
      * decrement below would yield a negative count.
      */
     z &= (sh > 0);
@@ -556,9 +675,14 @@ int simd_test(void)
     --sh;
     touch(sh);
     y = z << sh;
-    touch(sh);
     if ( !to_bool(x == y + y) ) return __LINE__;
 
+#  if defined(__AVX2__) && ELEM_SIZE >= 4
+    touch(sh);
+    x = y >> sh;
+    if ( !to_bool(x == z) ) return __LINE__;
+#  endif
+
 # endif
 
 #endif
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -8,11 +8,10 @@
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
-#include "sse2-avx.h"
-#include "sse4-avx.h"
 #include "avx.h"
 #include "fma4.h"
 #include "fma.h"
+#include "avx2.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -46,8 +45,6 @@ static bool simd_check_avx(void)
 {
     return cpu_has_avx;
 }
-#define simd_check_sse2_avx  simd_check_avx
-#define simd_check_sse4_avx  simd_check_avx
 
 static bool simd_check_fma4(void)
 {
@@ -59,6 +56,11 @@ static bool simd_check_fma(void)
     return cpu_has_fma;
 }
 
+static bool simd_check_avx2(void)
+{
+    return cpu_has_avx2;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -133,22 +135,6 @@ static const struct {
     SIMD(SSE4 packed u32,        sse4,      16u4),
     SIMD(SSE4 packed s64,        sse4,      16i8),
     SIMD(SSE4 packed u64,        sse4,      16u8),
-    SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
-    SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
-    SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
-    SIMD(SSE2/AVX packed u16,    sse2_avx,  16u2),
-    SIMD(SSE2/AVX packed s32,    sse2_avx,  16i4),
-    SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
-    SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
-    SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
-    SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
-    SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
-    SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
-    SIMD(SSE4/AVX packed u16,    sse4_avx,  16u2),
-    SIMD(SSE4/AVX packed s32,    sse4_avx,  16i4),
-    SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
-    SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
-    SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
     SIMD(AVX scalar single,      avx,         f4),
     SIMD(AVX 128bit single,      avx,       16f4),
     SIMD(AVX 256bit single,      avx,       32f4),
@@ -167,6 +153,26 @@ static const struct {
     SIMD(FMA scalar double,      fma,         f8),
     SIMD(FMA 128bit double,      fma,       16f8),
     SIMD(FMA 256bit double,      fma,       32f8),
+    SIMD(AVX2 128bit single,     avx2,      16f4),
+    SIMD(AVX2 256bit single,     avx2,      32f4),
+    SIMD(AVX2 128bit double,     avx2,      16f8),
+    SIMD(AVX2 256bit double,     avx2,      32f8),
+    SIMD(AVX2 s8x16,             avx2,      16i1),
+    SIMD(AVX2 u8x16,             avx2,      16u1),
+    SIMD(AVX2 s16x8,             avx2,      16i2),
+    SIMD(AVX2 u16x8,             avx2,      16u2),
+    SIMD(AVX2 s32x4,             avx2,      16i4),
+    SIMD(AVX2 u32x4,             avx2,      16u4),
+    SIMD(AVX2 s64x2,             avx2,      16i8),
+    SIMD(AVX2 u64x2,             avx2,      16u8),
+    SIMD(AVX2 s8x32,             avx2,      32i1),
+    SIMD(AVX2 u8x32,             avx2,      32u1),
+    SIMD(AVX2 s16x16,            avx2,      32i2),
+    SIMD(AVX2 u16x16,            avx2,      32u2),
+    SIMD(AVX2 s32x8,             avx2,      32i4),
+    SIMD(AVX2 u32x8,             avx2,      32u4),
+    SIMD(AVX2 s64x4,             avx2,      32i8),
+    SIMD(AVX2 u64x4,             avx2,      32u8),
 #undef SIMD_
 #undef SIMD
 };
@@ -2925,6 +2931,91 @@ int main(int argc, char **argv)
              res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
             goto fail;
 
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vpmaskmovd %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx2 )
+    {
+        decl_insn(vpmaskmovd);
+
+        asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+                       put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)")
+#else
+                       put_insn(vpmaskmovd,
+                                ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a")
+#endif
+                       :: "d" (NULL), "r" (~0) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vpmaskmovd);
+        regs.edx = (unsigned long)res + MMAP_SZ - 4;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
+            goto fail;
+
+        asm volatile ( "vpinsrd $0b11, %0, %%xmm1, %%xmm2" :: "r" (~0) );
+        memset(res, 0xdb, 32);
+        set_insn(vpmaskmovd);
+        regs.edx = (unsigned long)(res - 3);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+             res[0] || memcmp(res + 1, res + 4, 12) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vpmaskmovq %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx2 )
+    {
+        decl_insn(vpmaskmovq);
+
+        asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+                       "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t"
+                       put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)")
+#else
+                       ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t"
+                       put_insn(vpmaskmovq,
+                                ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a")
+#endif
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vpmaskmovq);
+        regs.edx = (unsigned long)res + MMAP_SZ - 8;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             res[MMAP_SZ / sizeof(*res) - 2] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
+            goto fail;
+
+#if 0 /* Don't use AVX2 instructions for now */
+        asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" );
+#else
+        asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" );
+#endif
+        memset(res, 0xdb, 32);
+        set_insn(vpmaskmovq);
+        regs.edx = (unsigned long)(res - 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+            goto fail;
+
         printf("okay\n");
     }
     else
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -370,7 +370,7 @@ static const struct {
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
-    [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+    [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
     [0x1a] = { .simd_size = simd_128, .two_op = 1 },
@@ -382,9 +382,15 @@ static const struct {
     [0x2c ... 0x2d] = { .simd_size = simd_other },
     [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
-    [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
+    [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x45 ... 0x47] = { .simd_size = simd_packed_int },
+    [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
+    [0x5a] = { .simd_size = simd_128, .two_op = 1 },
+    [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+    [0x8c] = { .simd_size = simd_other },
+    [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
     [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
     [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
     [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
@@ -406,6 +412,9 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
+    [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x02] = { .simd_size = simd_packed_int },
     [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
@@ -419,9 +428,12 @@ static const struct {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x38] = { .simd_size = simd_128 },
+    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
+    [0x46] = { .simd_size = simd_packed_int },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -2964,7 +2976,7 @@ x86_decode(
         }
         break;
 
-    case simd_scalar_fp:
+    case simd_scalar_fp: /* case simd_scalar_dq: */
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
@@ -6057,6 +6069,10 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( !vex.l )
                 goto simd_0f_avx;
+            /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x47): /* vpsllv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    simd_0f_avx2:
             host_and_vcpu_must_have(avx2);
             goto simd_0f_ymm;
         }
@@ -6156,7 +6172,10 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( vex.l )
+            {
+    simd_0f_imm8_avx2:
                 host_and_vcpu_must_have(avx2);
+            }
             else
             {
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
@@ -7240,6 +7259,11 @@ x86_emulate(
         op_bytes = 8 << vex.l;
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -7356,6 +7380,80 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
+        op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
+        generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
+    {
+        typeof(vex) *pvex;
+        unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
+
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        host_and_vcpu_must_have(avx2);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /*
+         * While we can't reasonably provide fully correct behavior here
+         * (in particular, for writes, avoiding the memory read in anticipation
+         * of all elements in the range eventually being written), we can (and
+         * should) still limit the memory access to the smallest possible range
+         * (suppressing it altogether if all mask bits are clear), to provide
+         * correct faulting behavior. Read the mask bits via vpmovmskb
+         * for that purpose.
+         */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0xd7; /* vpmovmskb */
+        /* Use %rax as GPR destination and VEX.vvvv as source. */
+        pvex->r = 1;
+        pvex->b = !mode_64bit() || (vex.reg >> 3);
+        opc[1] = 0xc0 | (~vex.reg & 7);
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+        put_stub(stub);
+
+        /* Convert byte granular result to dword/qword granularity. */
+        ea.val &= mask;
+        if ( !ea.val )
+            goto complete_insn;
+
+        first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
+        ea.val >>= first_byte;
+        op_bytes = 32 - __builtin_clz(ea.val);
+
+        /*
+         * Even for the memory write variant a memory read is needed, unless
+         * all set mask bits are contiguous.
+         */
+        if ( ea.val & (ea.val + ~mask + 1) )
+            d = (d & ~SrcMask) | SrcMem;
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert memory operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        opc[1] = modrm & 0x38;
+        fic.insn_bytes = PFX_BYTES + 2;
+
+        break;
+    }
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7564,6 +7662,20 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */
+        generate_exception_if(!vex.l || !vex.w, EXC_UD);
+        goto simd_0f_imm8_avx2;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x02): /* vpblendd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx2;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
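
A quick standalone illustration of the mask-to-range computation the
vpmaskmov{d,q} handler above performs on the vpmovmskb result (a minimal
sketch in plain C with a hypothetical mask_to_range() helper, not the
emulator code; the mask constants match the vex.w selection in the patch):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Given the byte-granular vpmovmskb result, compute the smallest
     * byte range covering all enabled dword/qword elements, or no
     * range at all when every mask bit is clear.
     */
    static void mask_to_range(uint32_t msk, unsigned int elem_bytes)
    {
        /* One bit per element's most significant byte (where the mask
         * sign bit lives): 0x88888888 for dwords, 0x80808080 for qwords. */
        uint32_t mask = elem_bytes == 8 ? 0x80808080U : 0x88888888U;
        unsigned int first_byte, op_bytes;

        msk &= mask;
        if ( !msk )
        {
            printf("access suppressed\n");
            return;
        }

        /* Round the lowest set bit down to an element boundary. */
        first_byte = __builtin_ctz(msk) & ~(elem_bytes - 1);
        msk >>= first_byte;
        /* One past the highest set bit, relative to first_byte. */
        op_bytes = 32 - __builtin_clz(msk);

        printf("access bytes [%u, %u)\n", first_byte, first_byte + op_bytes);
    }

    int main(void)
    {
        mask_to_range(0x00080800, 4); /* dword elements 2 and 4 -> [8, 20) */
        mask_to_range(0x80000000, 8); /* only qword element 3 -> [24, 32) */
        return 0;
    }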




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 08/17] x86emul: fold/eliminate some local variables
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (6 preceding siblings ...)
  2017-06-21 12:02 ` [PATCH 07/17] x86emul: support most remaining AVX2 insns Jan Beulich
@ 2017-06-21 12:03 ` Jan Beulich
  2017-06-21 12:04 ` [PATCH 09/17] x86emul: support AVX2 gather insns Jan Beulich
                   ` (9 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:03 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 5327 bytes --]

Make i switch-wide (at once making it unsigned, as it should have
been) and introduce n (for immediate use in enter and aam/aad
handling). Eliminate on-stack arrays in pusha/popa handling. Use ea.val
instead of a custom variable in bound handling.

No (intended) functional change.
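
The pusha/popa rework relies on decode_register() following the
architectural GPR encoding order; a hypothetical stand-in, just to
illustrate the index math and the esp special case (plain C, not the
emulator code):

    #include <stdio.h>

    /* Architectural encoding order, which decode_register() is assumed
     * to follow: this is also the order pusha pushes in. */
    static const char *const gpr[8] = {
        "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"
    };

    int main(void)
    {
        unsigned int i;

        for ( i = 0; i < 8; i++ )          /* pusha: index i */
            printf("push %s\n", gpr[i]);
        for ( i = 0; i < 8; i++ )          /* popa: index 7 - i */
            printf("pop  %s%s\n", gpr[7 - i],
                   7 - i == 4 ? " (value discarded)" : "");
        return 0;
    }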

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3229,6 +3229,7 @@ x86_emulate(
         struct segment_register cs, sreg;
         struct cpuid_leaf cpuid_leaf;
         uint64_t msr_val;
+        unsigned int i, n;
         unsigned long dummy;
 
     case 0x00 ... 0x05: add: /* add */
@@ -3361,47 +3362,45 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x60: /* pusha */ {
-        int i;
-        unsigned int regs[] = {
-            _regs.eax, _regs.ecx, _regs.edx, _regs.ebx,
-            _regs.esp, _regs.ebp, _regs.esi, _regs.edi };
-
+    case 0x60: /* pusha */
         fail_if(!ops->write);
+        ea.val = _regs.esp;
         for ( i = 0; i < 8; i++ )
+        {
+            void *reg = decode_register(i, &_regs, 0);
+
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                  &regs[i], op_bytes, ctxt)) != 0 )
-            goto done;
+                                  reg != &_regs.esp ? reg : &ea.val,
+                                  op_bytes, ctxt)) != 0 )
+                goto done;
+        }
         break;
-    }
-
-    case 0x61: /* popa */ {
-        int i;
-        unsigned int dummy_esp, *regs[] = {
-            &_regs.edi, &_regs.esi, &_regs.ebp, &dummy_esp,
-            &_regs.ebx, &_regs.edx, &_regs.ecx, &_regs.eax };
 
+    case 0x61: /* popa */
         for ( i = 0; i < 8; i++ )
         {
+            void *reg = decode_register(7 - i, &_regs, 0);
+
             if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
                                   &dst.val, op_bytes, ctxt, ops)) != 0 )
                 goto done;
+            if ( reg == &_regs.r(sp) )
+                continue;
             if ( op_bytes == 2 )
-                *(uint16_t *)regs[i] = (uint16_t)dst.val;
+                *(uint16_t *)reg = dst.val;
             else
-                *regs[i] = dst.val; /* 64b: zero-ext done by read_ulong() */
+                *(unsigned long *)reg = dst.val;
         }
         break;
-    }
 
     case 0x62: /* bound */ {
-        unsigned long src_val2;
         int lb, ub, idx;
+
         generate_exception_if(src.type != OP_MEM, EXC_UD);
         if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes,
-                              &src_val2, op_bytes, ctxt, ops)) )
+                              &ea.val, op_bytes, ctxt, ops)) )
             goto done;
-        ub  = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
+        ub  = (op_bytes == 2) ? (int16_t)ea.val   : (int32_t)ea.val;
         lb  = (op_bytes == 2) ? (int16_t)src.val  : (int32_t)src.val;
         idx = (op_bytes == 2) ? (int16_t)dst.val  : (int32_t)dst.val;
         generate_exception_if((idx < lb) || (idx > ub), EXC_BR);
@@ -3957,10 +3956,7 @@ x86_emulate(
         dst.val = src.val;
         break;
 
-    case 0xc8: /* enter imm16,imm8 */ {
-        uint8_t depth = imm2 & 31;
-        int i;
-
+    case 0xc8: /* enter imm16,imm8 */
         dst.type = OP_REG;
         dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
         dst.reg = (unsigned long *)&_regs.r(bp);
@@ -3970,9 +3966,10 @@ x86_emulate(
             goto done;
         dst.val = _regs.r(sp);
 
-        if ( depth > 0 )
+        n = imm2 & 31;
+        if ( n )
         {
-            for ( i = 1; i < depth; i++ )
+            for ( i = 1; i < n; i++ )
             {
                 unsigned long ebp, temp_data;
                 ebp = truncate_word(_regs.r(bp) - i*dst.bytes, ctxt->sp_size/8);
@@ -3989,7 +3986,6 @@ x86_emulate(
 
         sp_pre_dec(src.val);
         break;
-    }
 
     case 0xc9: /* leave */
         /* First writeback, to %%esp. */
@@ -4084,28 +4080,21 @@ x86_emulate(
         goto grp2;
 
     case 0xd4: /* aam */
-    case 0xd5: /* aad */ {
-        unsigned int base = (uint8_t)src.val;
-
+    case 0xd5: /* aad */
+        n = (uint8_t)src.val;
         if ( b & 0x01 )
-        {
-            uint16_t ax = _regs.ax;
-
-            _regs.ax = (uint8_t)(ax + ((ax >> 8) * base));
-        }
+            _regs.ax = (uint8_t)(_regs.al + (_regs.ah * n));
         else
         {
-            uint8_t al = _regs.al;
-
-            generate_exception_if(!base, EXC_DE);
-            _regs.ax = ((al / base) << 8) | (al % base);
+            generate_exception_if(!n, EXC_DE);
+            _regs.ah = _regs.al / n;
+            _regs.al = _regs.al % n;
         }
         _regs.eflags &= ~(X86_EFLAGS_SF | X86_EFLAGS_ZF | X86_EFLAGS_PF);
         _regs.eflags |= !_regs.al ? X86_EFLAGS_ZF : 0;
         _regs.eflags |= ((int8_t)_regs.al < 0) ? X86_EFLAGS_SF : 0;
         _regs.eflags |= even_parity(_regs.al) ? X86_EFLAGS_PF : 0;
         break;
-    }
 
     case 0xd6: /* salc */
         _regs.al = (_regs.eflags & X86_EFLAGS_CF) ? 0xff : 0x00;
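
As a sanity check of the aam/aad rewrite above, the architectural
semantics in isolation (a minimal sketch with plain C helpers, not the
emulator code; flag updates omitted):

    #include <assert.h>
    #include <stdint.h>

    /* aam: ah = al / n; al = al % n (n == 0 raises #DE). */
    static uint16_t aam(uint8_t al, uint8_t n)
    {
        return ((uint16_t)(al / n) << 8) | (al % n);
    }

    /* aad: al = al + ah * n; ah = 0. */
    static uint16_t aad(uint16_t ax, uint8_t n)
    {
        return (uint8_t)((uint8_t)ax + (ax >> 8) * n);
    }

    int main(void)
    {
        assert(aam(123, 10) == 0x0c03); /* 123 == 12 * 10 + 3 */
        assert(aad(0x0c03, 10) == 123); /* and back again */
        return 0;
    }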



[-- Attachment #2: x86emul-combine-i.patch --]
[-- Type: text/plain, Size: 5371 bytes --]

x86emul: fold/eliminate some local variables

Make i switch-wide (at once makeing it unsigned, as it should have
been) and introduce n (for immediate use in enter and aam/aad
handling). Eliminate on-stack arrays in pusha/popa handling. Use ea.val
insatead of a custom variable in bound handling.

No (intended) functional change.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3229,6 +3229,7 @@ x86_emulate(
         struct segment_register cs, sreg;
         struct cpuid_leaf cpuid_leaf;
         uint64_t msr_val;
+        unsigned int i, n;
         unsigned long dummy;
 
     case 0x00 ... 0x05: add: /* add */
@@ -3361,47 +3362,45 @@ x86_emulate(
             goto done;
         break;
 
-    case 0x60: /* pusha */ {
-        int i;
-        unsigned int regs[] = {
-            _regs.eax, _regs.ecx, _regs.edx, _regs.ebx,
-            _regs.esp, _regs.ebp, _regs.esi, _regs.edi };
-
+    case 0x60: /* pusha */
         fail_if(!ops->write);
+        ea.val = _regs.esp;
         for ( i = 0; i < 8; i++ )
+        {
+            void *reg = decode_register(i, &_regs, 0);
+
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                  &regs[i], op_bytes, ctxt)) != 0 )
-            goto done;
+                                  reg != &_regs.esp ? reg : &ea.val,
+                                  op_bytes, ctxt)) != 0 )
+                goto done;
+        }
         break;
-    }
-
-    case 0x61: /* popa */ {
-        int i;
-        unsigned int dummy_esp, *regs[] = {
-            &_regs.edi, &_regs.esi, &_regs.ebp, &dummy_esp,
-            &_regs.ebx, &_regs.edx, &_regs.ecx, &_regs.eax };
 
+    case 0x61: /* popa */
         for ( i = 0; i < 8; i++ )
         {
+            void *reg = decode_register(7 - i, &_regs, 0);
+
             if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
                                   &dst.val, op_bytes, ctxt, ops)) != 0 )
                 goto done;
+            if ( reg == &_regs.r(sp) )
+                continue;
             if ( op_bytes == 2 )
-                *(uint16_t *)regs[i] = (uint16_t)dst.val;
+                *(uint16_t *)reg = dst.val;
             else
-                *regs[i] = dst.val; /* 64b: zero-ext done by read_ulong() */
+                *(unsigned long *)reg = dst.val;
         }
         break;
-    }
 
     case 0x62: /* bound */ {
-        unsigned long src_val2;
         int lb, ub, idx;
+
         generate_exception_if(src.type != OP_MEM, EXC_UD);
         if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes,
-                              &src_val2, op_bytes, ctxt, ops)) )
+                              &ea.val, op_bytes, ctxt, ops)) )
             goto done;
-        ub  = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
+        ub  = (op_bytes == 2) ? (int16_t)ea.val   : (int32_t)ea.val;
         lb  = (op_bytes == 2) ? (int16_t)src.val  : (int32_t)src.val;
         idx = (op_bytes == 2) ? (int16_t)dst.val  : (int32_t)dst.val;
         generate_exception_if((idx < lb) || (idx > ub), EXC_BR);
@@ -3957,10 +3956,7 @@ x86_emulate(
         dst.val = src.val;
         break;
 
-    case 0xc8: /* enter imm16,imm8 */ {
-        uint8_t depth = imm2 & 31;
-        int i;
-
+    case 0xc8: /* enter imm16,imm8 */
         dst.type = OP_REG;
         dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
         dst.reg = (unsigned long *)&_regs.r(bp);
@@ -3970,9 +3966,10 @@ x86_emulate(
             goto done;
         dst.val = _regs.r(sp);
 
-        if ( depth > 0 )
+        n = imm2 & 31;
+        if ( n )
         {
-            for ( i = 1; i < depth; i++ )
+            for ( i = 1; i < n; i++ )
             {
                 unsigned long ebp, temp_data;
                 ebp = truncate_word(_regs.r(bp) - i*dst.bytes, ctxt->sp_size/8);
@@ -3989,7 +3986,6 @@ x86_emulate(
 
         sp_pre_dec(src.val);
         break;
-    }
 
     case 0xc9: /* leave */
         /* First writeback, to %%esp. */
@@ -4084,28 +4080,21 @@ x86_emulate(
         goto grp2;
 
     case 0xd4: /* aam */
-    case 0xd5: /* aad */ {
-        unsigned int base = (uint8_t)src.val;
-
+    case 0xd5: /* aad */
+        n = (uint8_t)src.val;
         if ( b & 0x01 )
-        {
-            uint16_t ax = _regs.ax;
-
-            _regs.ax = (uint8_t)(ax + ((ax >> 8) * base));
-        }
+            _regs.ax = (uint8_t)(_regs.al + (_regs.ah * n));
         else
         {
-            uint8_t al = _regs.al;
-
-            generate_exception_if(!base, EXC_DE);
-            _regs.ax = ((al / base) << 8) | (al % base);
+            generate_exception_if(!n, EXC_DE);
+            _regs.al = _regs.al % n;
+            _regs.ah = _regs.al / n;
         }
         _regs.eflags &= ~(X86_EFLAGS_SF | X86_EFLAGS_ZF | X86_EFLAGS_PF);
         _regs.eflags |= !_regs.al ? X86_EFLAGS_ZF : 0;
         _regs.eflags |= ((int8_t)_regs.al < 0) ? X86_EFLAGS_SF : 0;
         _regs.eflags |= even_parity(_regs.al) ? X86_EFLAGS_PF : 0;
         break;
-    }
 
     case 0xd6: /* salc */
         _regs.al = (_regs.eflags & X86_EFLAGS_CF) ? 0xff : 0x00;

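For completeness, the architectural behaviour being open-coded here is
the following (a minimal standalone C model; helper names are
illustrative only, and the flag updates are left to the common code
right after the division):

#include <stdint.h>

/* AAM imm8: AH = AL / imm8, AL = AL % imm8 (#DE when imm8 is zero). */
static void aam_model(uint8_t *al, uint8_t *ah, uint8_t base)
{
    uint8_t quot = *al / base;   /* take the quotient before AL changes */

    *al %= base;
    *ah = quot;
}

/* AAD imm8: AL = AL + AH * imm8, AH = 0. */
static void aad_model(uint8_t *al, uint8_t *ah, uint8_t base)
{
    *al += *ah * base;
    *ah = 0;
}

Note how the quotient needs to be taken before AL is reduced - doing
the remainder assignment first would make AH always end up zero.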

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 09/17] x86emul: support AVX2 gather insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (7 preceding siblings ...)
  2017-06-21 12:03 ` [PATCH 08/17] x86emul: fold/eliminate some local variables Jan Beulich
@ 2017-06-21 12:04 ` Jan Beulich
  2017-06-21 12:04 ` [PATCH 10/17] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
                   ` (8 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:04 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -13,7 +13,8 @@ run: $(TARGET)
 
 SIMD := sse sse2 sse4 avx avx2
 FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) $(FMA)
+SG := avx2-sg
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -39,6 +40,10 @@ fma-flts := $(avx-flts)
 avx2-vecs := $(avx-vecs)
 avx2-ints := 1 2 4 8
 avx2-flts := 4 8
+avx2-sg-vecs := $(avx2-vecs)
+avx2-sg-idxs := 4 8
+avx2-sg-ints := 4 8
+avx2-sg-flts := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -55,8 +60,18 @@ $(1)-cflags := \
 	$(foreach flt,$($(1)-flts), \
 	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
 endef
+define simd-sg-defs
+$(1)-cflags := \
+	$(foreach vec,$($(1)-vecs), \
+	  $(foreach idx,$($(1)-idxs), \
+	   $(foreach int,$($(1)-ints), \
+	     "-D_$(vec)x$(idx)i$(int) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DINT_SIZE=$(int)") \
+	   $(foreach flt,$($(1)-flts), \
+	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
+$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -78,7 +93,10 @@ $(addsuffix .c,$(SIMD)):
 $(addsuffix .c,$(FMA)):
 	ln -sf simd-fma.c $@
 
-$(addsuffix .o,$(SIMD) $(FMA)): simd.h
+$(addsuffix .c,$(SG)):
+	ln -sf simd-sg.c $@
+
+$(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -0,0 +1,209 @@
+#ifdef INT_SIZE
+# define ELEM_SIZE INT_SIZE
+#else
+# define ELEM_SIZE FLOAT_SIZE
+#endif
+
+#define VEC_SIZE (IDX_SIZE <= ELEM_SIZE ? VEC_MAX \
+                                        : VEC_MAX * ELEM_SIZE / IDX_SIZE)
+#if VEC_SIZE < 16
+# undef VEC_SIZE
+# define VEC_SIZE 16
+#endif
+
+#include "simd.h"
+
+ENTRY(sg_test);
+
+#undef MODE
+#if IDX_SIZE == 4
+# define MODE SI
+#elif IDX_SIZE == 8
+# define MODE DI
+#endif
+
+#define IVEC_SIZE (ELEM_SIZE <= IDX_SIZE ? VEC_MAX \
+                                         : VEC_MAX * IDX_SIZE / ELEM_SIZE)
+#if IVEC_SIZE < 16
+# undef IVEC_SIZE
+# define IVEC_SIZE 16
+#endif
+
+typedef signed int __attribute__((mode(MODE), vector_size(IVEC_SIZE))) idx_t;
+typedef long long __attribute__((vector_size(IVEC_SIZE))) idi_t;
+
+#define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
+                    VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
+
+#if VEC_SIZE == 16
+# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+#else
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+#endif
+
+#if defined(__AVX2__)
+# if VEC_MAX == 16
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv2df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv2df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# elif VEC_MAX == 32
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv4df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# endif
+#endif
+
+#define GLUE_(x, y) x ## y
+#define GLUE(x, y) GLUE_(x, y)
+
+#define PUT2(n)      (n),        (n) +  1
+#define PUT4(n)  PUT2(n),   PUT2((n) +  2)
+#define PUT8(n)  PUT4(n),   PUT4((n) +  4)
+#define PUT16(n) PUT8(n),   PUT8((n) +  8)
+#define PUT32(n) PUT16(n), PUT16((n) + 16)
+
+const typeof((vec_t){}[0]) array[] = {
+    GLUE(PUT, VEC_MAX)(1),
+    GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
+};
+
+int sg_test(void)
+{
+    unsigned int i;
+    vec_t x, y, full = (vec_t){} == 0;
+    idx_t idx, inv;
+
+    for ( i = 0; i < IVEC_SIZE / IDX_SIZE; ++i )
+    {
+        idx[i] = i + 1;
+        inv[i] = ITEM_COUNT - i;
+    }
+
+    touch(idx);
+    touch(inv);
+
+    x = gather(full, array, (idx_t){}, full, 1);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, idx, full, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != i + 2 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, idx * ELEM_SIZE, full, 2);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != i * 2 + 3 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, inv, full, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != inv[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
+#if ITEM_COUNT == ELEM_COUNT
+    if ( !to_bool(y == x - 1) )
+        return __LINE__;
+#else
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( y[i] != x[i] - 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( y[i] )
+            return __LINE__;
+#endif
+
+#if ELEM_SIZE > 1
+    x = gather(full, array, inv * 2, full, ELEM_SIZE / 2);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != inv[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+# if ELEM_SIZE == IDX_SIZE
+    y = gather(x, array, idx, (idx & inv) != 0, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( y[i] != ((i + 1) & (ITEM_COUNT - i) ? idx : inv)[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( y[i] )
+            return __LINE__;
+# endif
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,6 +12,7 @@
 #include "fma4.h"
 #include "fma.h"
 #include "avx2.h"
+#include "avx2-sg.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -60,6 +61,7 @@ static bool simd_check_avx2(void)
 {
     return cpu_has_avx2;
 }
+#define simd_check_avx2_sg simd_check_avx2
 
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
@@ -173,6 +175,22 @@ static const struct {
     SIMD(AVX2 u32x8,             avx2,      32u4),
     SIMD(AVX2 s64x4,             avx2,      32i8),
     SIMD(AVX2 u64x4,             avx2,      32u8),
+    SIMD(AVX2 S/G f32[4x32],  avx2_sg,    16x4f4),
+    SIMD(AVX2 S/G f64[2x32],  avx2_sg,    16x4f8),
+    SIMD(AVX2 S/G f32[2x64],  avx2_sg,    16x8f4),
+    SIMD(AVX2 S/G f64[2x64],  avx2_sg,    16x8f8),
+    SIMD(AVX2 S/G f32[8x32],  avx2_sg,    32x4f4),
+    SIMD(AVX2 S/G f64[4x32],  avx2_sg,    32x4f8),
+    SIMD(AVX2 S/G f32[4x64],  avx2_sg,    32x8f4),
+    SIMD(AVX2 S/G f64[4x64],  avx2_sg,    32x8f8),
+    SIMD(AVX2 S/G i32[4x32],  avx2_sg,    16x4i4),
+    SIMD(AVX2 S/G i64[2x32],  avx2_sg,    16x4i8),
+    SIMD(AVX2 S/G i32[2x64],  avx2_sg,    16x8i4),
+    SIMD(AVX2 S/G i64[2x64],  avx2_sg,    16x8i8),
+    SIMD(AVX2 S/G i32[8x32],  avx2_sg,    32x4i4),
+    SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
+    SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
+    SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -391,6 +391,7 @@ static const struct {
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
     [0x8c] = { .simd_size = simd_other },
     [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
     [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
     [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
@@ -598,6 +599,7 @@ struct x86_emulate_state {
         ext_8f0a,
     } ext;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
+    uint8_t sib_index, sib_scale;
     uint8_t rex_prefix;
     bool lock_prefix;
     bool not_64bit; /* Instruction not available in 64bit. */
@@ -2409,7 +2411,7 @@ x86_decode(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops  *ops)
 {
-    uint8_t b, d, sib, sib_index, sib_base;
+    uint8_t b, d;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
     enum x86_segment override_seg = x86_seg_none;
     bool pc_rel = false;
@@ -2735,6 +2737,7 @@ x86_decode(
 
         if ( modrm_mod == 3 )
         {
+            generate_exception_if(d & vSIB, EXC_UD);
             modrm_rm |= (rex_prefix & 1) << 3;
             ea.type = OP_REG;
         }
@@ -2795,13 +2798,17 @@ x86_decode(
             ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
-                sib = insn_fetch_type(uint8_t);
-                sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
-                sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
-                if ( sib_index != 4 && !(d & vSIB) )
-                    ea.mem.off = *(long *)decode_register(sib_index,
+                uint8_t sib = insn_fetch_type(uint8_t);
+                uint8_t sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
+
+                state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
+                state->sib_scale = (sib >> 6) & 3;
+                if ( state->sib_index != 4 && !(d & vSIB) )
+                {
+                    ea.mem.off = *(long *)decode_register(state->sib_index,
                                                           state->regs, 0);
-                ea.mem.off <<= (sib >> 6) & 3;
+                    ea.mem.off <<= state->sib_scale;
+                }
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
                 else if ( sib_base == 4 )
@@ -7443,6 +7450,110 @@ x86_emulate(
         break;
     }
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x92): /* vgatherdp{s,d} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x93): /* vgatherqp{s,d} {x,y}mm,mem,{x,y}mm */
+    {
+        unsigned int mask_reg = ~vex.reg & (mode_64bit() ? 0xf : 7);
+        typeof(vex) *pvex;
+        union {
+            int32_t dw[8];
+            int64_t qw[4];
+        } index, mask;
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if(modrm_reg == state->sib_index ||
+                              modrm_reg == mask_reg ||
+                              state->sib_index == mask_reg, EXC_UD);
+        generate_exception_if(!cpu_has_avx, EXC_UD);
+        vcpu_must_have(avx2);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /* Read destination, index, and mask registers. */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa */
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pvex->r = !mode_64bit() || !(modrm_reg & 8);
+        pvex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pvex->pfx = vex_f3; /* vmovdqu */
+        /* Switch to sib_index as source. */
+        pvex->r = !mode_64bit() || !(state->sib_index & 8);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+
+        /* Switch to mask_reg as source. */
+        pvex->r = !mode_64bit() || !(mask_reg & 8);
+        opc[1] = (mask_reg & 7) << 3;
+
+        invoke_stub("", "", "=m" (mask) : "a" (&mask));
+        put_stub(stub);
+
+        /* Clear untouched parts of the destination and mask values. */
+        n = 1 << (2 + vex.l - ((b & 1) | vex.w));
+        op_bytes = 4 << vex.w;
+        memset((void *)mmvalp + n * op_bytes, 0, 32 - n * op_bytes);
+        memset((void *)&mask + n * op_bytes, 0, 32 - n * op_bytes);
+
+        for ( i = 0; i < n && rc == X86EMUL_OKAY; ++i )
+        {
+            if ( (vex.w ? mask.qw[i] : mask.dw[i]) < 0 )
+            {
+                signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+                rc = ops->read(ea.mem.seg,
+                               ea.mem.off + (idx << state->sib_scale),
+                               (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+                if ( rc != X86EMUL_OKAY )
+                    break;
+
+#ifdef __XEN__
+                if ( i + 1 < n && local_events_need_delivery() )
+                    rc = X86EMUL_RETRY;
+#endif
+            }
+
+            if ( vex.w )
+                mask.qw[i] = 0;
+            else
+                mask.dw[i] = 0;
+        }
+
+        /* Write destination and mask registers. */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0x6f; /* vmovdqa */
+        /* Use modrm_reg as destination and (%rax) as source. */
+        pvex->r = !mode_64bit() || !(modrm_reg & 8);
+        pvex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        pvex->pfx = vex_f3; /* vmovdqu */
+        /* Switch to mask_reg as destination. */
+        pvex->r = !mode_64bit() || !(mask_reg & 8);
+        opc[1] = (mask_reg & 7) << 3;
+
+        invoke_stub("", "", "+m" (mask) : "a" (&mask));
+        put_stub(stub);
+
+        state->simd_size = simd_none;
+        break;
+    }
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -10,6 +10,7 @@
  */
 
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <asm/x86_emulate.h>
 #include <asm/asm_defns.h> /* mark_regs_dirty() */
 #include <asm/processor.h> /* current_cpu_info */

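For reference, the per-element loop above implements the architectural
gather behaviour, which can be modelled in scalar C like this
(simplified to dword elements with dword indices and no fault handling;
read32() is an illustrative stand-in for a guest memory read, not an
emulator hook):

#include <stdint.h>

typedef uint32_t (*read32_t)(uintptr_t addr);

static void vpgatherdd_model(uint32_t dst[4], int32_t mask[4],
                             const int32_t idx[4], uintptr_t base,
                             unsigned int scale, read32_t read32)
{
    for ( unsigned int i = 0; i < 4; ++i )
    {
        if ( mask[i] < 0 )   /* only each mask element's sign bit matters */
            dst[i] = read32(base + (intptr_t)idx[i] * scale);
        mask[i] = 0;         /* cleared element by element, which is what
                                allows a faulting gather to be restarted
                                without re-reading completed elements */
    }
}

The per-element clearing of the mask is also what makes the RETRY exit
in the loop above safe: state left in the mask register tells a
re-executed instruction which elements still need loading.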



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 10/17] x86emul: add tables for XOP 08 and 09 extension spaces
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (8 preceding siblings ...)
  2017-06-21 12:04 ` [PATCH 09/17] x86emul: support AVX2 gather insns Jan Beulich
@ 2017-06-21 12:04 ` Jan Beulich
  2017-06-21 12:05 ` [PATCH 11/17] x86emul: support XOP insns Jan Beulich
                   ` (7 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:04 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Convert the few opcodes supported so far in these spaces.

Also change two vex_* case labels to the equivalent ext_* ones (the
values are identical).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -458,6 +458,20 @@ static const opcode_desc_t xop_table[] =
     DstReg|SrcImm|ModRM,
 };
 
+static const struct {
+    uint8_t simd_size:5;
+    uint8_t two_op:1;
+    uint8_t four_op:1;
+} ext8f08_table[256] = {
+};
+
+static const struct {
+    uint8_t simd_size:5;
+    uint8_t two_op:1;
+} ext8f09_table[256] = {
+    [0x01 ... 0x02] = { .two_op = 1 },
+};
+
 #define REX_PREFIX 0x40
 #define REX_B 0x01
 #define REX_X 0x02
@@ -2716,7 +2730,7 @@ x86_decode(
             }
             break;
 
-        case vex_0f38:
+        case ext_0f38:
             d = ext0f38_table[b].to_mem ? DstMem | SrcReg
                                         : DstReg | SrcMem;
             if ( ext0f38_table[b].two_op )
@@ -2726,7 +2740,14 @@ x86_decode(
             state->simd_size = ext0f38_table[b].simd_size;
             break;
 
-        case vex_0f3a:
+        case ext_8f09:
+            if ( ext8f09_table[b].two_op )
+                d |= TwoOp;
+            state->simd_size = ext8f09_table[b].simd_size;
+            break;
+
+        case ext_0f3a:
+        case ext_8f08:
             /*
              * Cannot update d here yet, as the immediate operand still
              * needs fetching.
@@ -2919,6 +2940,15 @@ x86_decode(
         break;
 
     case ext_8f08:
+        d = DstReg | SrcMem;
+        if ( ext8f08_table[b].two_op )
+            d |= TwoOp;
+        else if ( ext8f08_table[b].four_op && !mode_64bit() )
+            imm1 &= 0x7f;
+        state->desc = d;
+        state->simd_size = ext8f08_table[b].simd_size;
+        break;
+
     case ext_8f09:
     case ext_8f0a:
         break;

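The imm1 &= 0x7f masking for four-operand forms deserves a note: in the
four-operand XOP (and VEX) encodings the fourth register operand is
carried in imm8[7:4], and outside 64-bit mode register bit 3 (i.e.
imm8 bit 7) is to be ignored. As an illustrative sketch (names are
made up here, only the bit layout matters):

#include <stdbool.h>
#include <stdint.h>

/* The is4 immediate byte: register number in bits 7:4 (any insn
 * specific payload sits in the low bits). */
static unsigned int fourth_reg(uint8_t imm8, bool is_64bit_mode)
{
    unsigned int reg = imm8 >> 4;

    return is_64bit_mode ? reg : (reg & 7); /* only 8 registers exist
                                               outside 64-bit mode */
}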




^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 11/17] x86emul: support XOP insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (9 preceding siblings ...)
  2017-06-21 12:04 ` [PATCH 10/17] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
@ 2017-06-21 12:05 ` Jan Beulich
  2017-06-21 12:05 ` [PATCH 12/17] x86emul: support 3DNow! insns Jan Beulich
                   ` (6 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:05 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -230,6 +230,7 @@
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
+tools/tests/x86_emulator/xop*.[ch]
 tools/tests/xen-access/xen-access
 tools/tests/xenstore/xs-test
 tools/tests/regression/installed/*
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2
+SIMD := sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -44,6 +44,9 @@ avx2-sg-vecs := $(avx2-vecs)
 avx2-sg-idxs := 4 8
 avx2-sg-ints := 4 8
 avx2-sg-flts := 4 8
+xop-vecs := $(avx-vecs)
+xop-ints := 1 2 4 8
+xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -98,6 +101,8 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
+xop.o: simd-fma.c
+
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -471,6 +471,86 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
+#ifdef __XOP__
+# undef select
+# if VEC_SIZE == 16
+#  if INT_SIZE == 2 || INT_SIZE == 4
+#   include "simd-fma.c"
+#  endif
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if INT_SIZE == 1 || UINT_SIZE == 1
+#   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
+#  elif INT_SIZE == 2 || UINT_SIZE == 2
+#   define swap2(x) \
+    ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
+                                  (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
+                                          (2 * inv - 2))))
+#  elif FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
+    vec_t t_; \
+    asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
+          "=x" (t_) : \
+          "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
+    t_; \
+})
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
+    /*                            __builtin_ia32_pmovsxdq128( */ \
+    /*                                __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
+    vdi_t s_ = __builtin_ia32_pmovsxdq128( \
+                   __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
+    vec_t t_; \
+    asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
+          "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
+    t_; \
+})
+#  endif
+#  if INT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddbw((vqi_t)(y))))
+#   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphsubbw((vqi_t)(y))))
+#  elif UINT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddubw((vqi_t)(y))))
+#  elif INT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
+                                                 __builtin_ia32_vphaddwd(y))
+#   undef hsub
+#   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
+                                                 __builtin_ia32_vphsubwd(y))
+#  elif UINT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
+                                                         __builtin_ia32_vphadduwd((vhi_t)(y))))
+#   undef hsub
+#  endif
+# elif VEC_SIZE == 32
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps256(x)
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd256(x)
+#  endif
+# elif VEC_SIZE == FLOAT_SIZE
+#  if VEC_SIZE == 4
+#   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
+#  elif VEC_SIZE == 8
+#   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
+#  endif
+# endif
+#endif
 
 int simd_test(void)
 {
@@ -576,6 +656,29 @@ int simd_test(void)
     if ( !to_bool(y == z) ) return __LINE__;
 # endif
 
+# ifdef frac
+    touch(src);
+    x = frac(src);
+    touch(src);
+    if ( !to_bool(x == 0) ) return __LINE__;
+
+    x = 1 / (src + 1);
+    touch(x);
+    y = frac(x);
+    touch(x);
+    if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# if defined(trunc) && defined(frac)
+    x = src / 4;
+    touch(x);
+    y = trunc(x);
+    touch(x);
+    z = frac(x);
+    touch(x);
+    if ( !to_bool(x == y + z) ) return __LINE__;
+# endif
+
 #else
 
 # if ELEM_SIZE > 1
@@ -677,7 +780,7 @@ int simd_test(void)
     y = z << sh;
     if ( !to_bool(x == y + y) ) return __LINE__;
 
-#  if defined(__AVX2__) && ELEM_SIZE >= 4
+#  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
     if ( !to_bool(x == z) ) return __LINE__;
@@ -871,6 +974,8 @@ int simd_test(void)
 #endif
 
 #ifdef hadd
+# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
+     (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
     x = src;
     for ( i = ELEM_COUNT; i >>= 1; )
     {
@@ -878,6 +983,7 @@ int simd_test(void)
         x = hadd((vec_t){}, x);
     }
     if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+# endif
 
 # ifdef hsub
     touch(src);
@@ -889,6 +995,9 @@ int simd_test(void)
 # endif
 #endif
 
+#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+    return -fma_test();
+#endif
 
     return 0;
 }
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,6 +1,8 @@
 #include "simd.h"
 
+#ifndef __XOP__
 ENTRY(fma_test);
+#endif
 
 #if VEC_SIZE < 16
 # define to_bool(cmp) (!~(cmp)[0])
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -13,6 +13,7 @@
 #include "fma.h"
 #include "avx2.h"
 #include "avx2-sg.h"
+#include "xop.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -63,6 +64,11 @@ static bool simd_check_avx2(void)
 }
 #define simd_check_avx2_sg simd_check_avx2
 
+static bool simd_check_xop(void)
+{
+    return cpu_has_xop;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -191,6 +197,22 @@ static const struct {
     SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
     SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
     SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
+    SIMD(XOP 128bit single,       xop,      16f4),
+    SIMD(XOP 256bit single,       xop,      32f4),
+    SIMD(XOP 128bit double,       xop,      16f8),
+    SIMD(XOP 256bit double,       xop,      32f8),
+    SIMD(XOP s8x16,               xop,      16i1),
+    SIMD(XOP u8x16,               xop,      16u1),
+    SIMD(XOP s16x8,               xop,      16i2),
+    SIMD(XOP u16x8,               xop,      16u2),
+    SIMD(XOP s32x4,               xop,      16i4),
+    SIMD(XOP u32x4,               xop,      16u4),
+    SIMD(XOP s64x2,               xop,      16i8),
+    SIMD(XOP u64x2,               xop,      16u8),
+    SIMD(XOP i8x32,               xop,      32i1),
+    SIMD(XOP i16x16,              xop,      32i2),
+    SIMD(XOP i32x8,               xop,      32i4),
+    SIMD(XOP i64x4,               xop,      32i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -172,6 +172,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 6)) != 0; \
 })
 
+#define cpu_has_xop ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.c & (1U << 11)) != 0; \
+})
+
 #define cpu_has_fma4 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -435,6 +435,7 @@ static const struct {
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
+    [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -463,6 +464,17 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext8f08_table[256] = {
+    [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcc ... 0xcf] = { .simd_size = simd_packed_int },
+    [0xec ... 0xef] = { .simd_size = simd_packed_int },
 };
 
 static const struct {
@@ -470,6 +482,16 @@ static const struct {
     uint8_t two_op:1;
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
+    [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
+    [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd6 ... 0xd7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xe1 ... 0xe3] = { .simd_size = simd_packed_int, .two_op = 1 },
 };
 
 #define REX_PREFIX 0x40
@@ -528,7 +550,7 @@ union vex {
 #define copy_VEX(ptr, vex) ({ \
     if ( !mode_64bit() ) \
         (vex).reg |= 8; \
-    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \
     (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
     (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
     container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
@@ -1653,6 +1675,7 @@ static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 12, ctxt, ops)
 #define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
@@ -2985,9 +3008,19 @@ x86_decode(
     case simd_packed_int:
         switch ( vex.pfx )
         {
-        case vex_none: op_bytes = 8;           break;
-        case vex_66:   op_bytes = 16 << vex.l; break;
-        default:       op_bytes = 0;           break;
+        case vex_none:
+            if ( !vex.opcx )
+            {
+                op_bytes = 8;
+                break;
+            }
+            /* fall through */
+        case vex_66:
+            op_bytes = 16 << vex.l;
+            break;
+        default:
+            op_bytes = 0;
+            break;
         }
         break;
 
@@ -7996,6 +8029,13 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
@@ -8133,6 +8173,41 @@ x86_emulate(
             asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+    case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8e): /* vpmacssdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8f): /* vpmacssdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x95): /* vpmacsww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x96): /* vpmacswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x97): /* vpmacsdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9e): /* vpmacsdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9f): /* vpmacsdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xa6): /* vpmadcsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xb6): /* vpmadcswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc0): /* vprotb $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc1): /* vprotw $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc2): /* vprotd $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc3): /* vprotq $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcc): /* vpcomb $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcd): /* vpcomw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xce): /* vpcomd $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcf): /* vpcomq $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xec): /* vpcomub $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xed): /* vpcomuw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xee): /* vpcomud $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xef): /* vpcomuq $imm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa3): /* vpperm xmm/m128,xmm,xmm,xmm */
+                                    /* vpperm xmm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa2): /* vpcmov {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                    /* vpcmov {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -8182,6 +8257,61 @@ x86_emulate(
         }
         goto cannot_emulate;
 
+    case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x80): /* vfrczps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_XOP(09, 0x81): /* vfrczpd {x,y}mm/mem,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_ymm;
+
+    case X86EMUL_OPC_XOP(09, 0xc1): /* vphaddbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc2): /* vphaddbd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc3): /* vphaddbq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc6): /* vphaddwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc7): /* vphaddwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xcb): /* vphadddq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd1): /* vphaddubw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd2): /* vphaddubd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd3): /* vphaddubq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd6): /* vphadduwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd7): /* vphadduwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xdb): /* vphaddudq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe1): /* vphsubbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe2): /* vphsubwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe3): /* vphsubdq xmm/m128,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x90): /* vprotb xmm/m128,xmm,xmm */
+                                    /* vprotb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x91): /* vprotw xmm/m128,xmm,xmm */
+                                    /* vprotw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x92): /* vprotd xmm/m128,xmm,xmm */
+                                    /* vprotd xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x93): /* vprotq xmm/m128,xmm,xmm */
+                                    /* vprotq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x94): /* vpshlb xmm/m128,xmm,xmm */
+                                    /* vpshlb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x95): /* vpshlw xmm/m128,xmm,xmm */
+                                    /* vpshlw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x96): /* vpshld xmm/m128,xmm,xmm */
+                                    /* vpshld xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x97): /* vpshlq xmm/m128,xmm,xmm */
+                                    /* vpshlq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x98): /* vpshab xmm/m128,xmm,xmm */
+                                    /* vpshab xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x99): /* vpshaw xmm/m128,xmm,xmm */
+                                    /* vpshaw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9a): /* vpshad xmm/m128,xmm,xmm */
+                                    /* vpshad xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9b): /* vpshaq xmm/m128,xmm,xmm */
+                                    /* vpshaq xmm,xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
+#define cpu_has_xop             boot_cpu_has(X86_FEATURE_XOP)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
 #define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)



[-- Attachment #2: x86emul-XOP.patch --]
[-- Type: text/plain, Size: 20234 bytes --]

x86emul: support XOP insns

Signed-off-by: Jan Beulich <jbeulich@suse.com>
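
XOP instructions are encoded like three-byte VEX ones, except that the
escape byte is 0x8f instead of 0xc4 and the five map bits select the
08/09/0a extension spaces handled below. As a minimal sketch of pulling
the fields out of such a prefix (the struct and helper names are purely
illustrative, assuming the usual VEX bit layout):

    #include <stdint.h>

    /* The two payload bytes after the 0x8f escape byte use the same bit
     * layout as three-byte VEX: inverted R/X/B plus the map number in
     * byte 1, then W, inverted vvvv, L and pp in byte 2. */
    struct xop_prefix {
        uint8_t map;            /* 0x08/0x09/0x0a extension space */
        uint8_t w, vvvv, l, pp;
    };

    static struct xop_prefix decode_xop(const uint8_t insn[3])
    {
        struct xop_prefix p = {
            .map  = insn[1] & 0x1f,
            .w    = (insn[2] >> 7) & 1,
            .vvvv = (~insn[2] >> 3) & 0xf, /* stored inverted */
            .l    = (insn[2] >> 2) & 1,
            .pp   = insn[2] & 3,
        };

        return p;
    }

This is also why copy_VEX() below has to re-emit 0x8f rather than 0xc4
when building the stub for an XOP-encoded insn.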

--- a/.gitignore
+++ b/.gitignore
@@ -230,6 +230,7 @@
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
+tools/tests/x86_emulator/xop*.[ch]
 tools/tests/xen-access/xen-access
 tools/tests/xenstore/xs-test
 tools/tests/regression/installed/*
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2
+SIMD := sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -44,6 +44,9 @@ avx2-sg-vecs := $(avx2-vecs)
 avx2-sg-idxs := 4 8
 avx2-sg-ints := 4 8
 avx2-sg-flts := 4 8
+xop-vecs := $(avx-vecs)
+xop-ints := 1 2 4 8
+xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -98,6 +101,8 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
+xop.o: simd-fma.c
+
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -471,6 +471,86 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
+#ifdef __XOP__
+# undef select
+# if VEC_SIZE == 16
+#  if INT_SIZE == 2 || INT_SIZE == 4
+#   include "simd-fma.c"
+#  endif
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if INT_SIZE == 1 || UINT_SIZE == 1
+#   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
+#  elif INT_SIZE == 2 || UINT_SIZE == 2
+#   define swap2(x) \
+    ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
+                                  (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
+                                          (2 * inv - 2))))
+#  elif FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
+    vec_t t_; \
+    asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
+          "=x" (t_) : \
+          "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
+    t_; \
+})
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
+    /*                            __builtin_ia32_pmovsxdq128( */ \
+    /*                                __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
+    vdi_t s_ = __builtin_ia32_pmovsxdq128( \
+                   __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
+    vec_t t_; \
+    asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
+          "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
+    t_; \
+})
+#  endif
+#  if INT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddbw((vqi_t)(y))))
+#   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphsubbw((vqi_t)(y))))
+#  elif UINT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddubw((vqi_t)(y))))
+#  elif INT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
+                                                 __builtin_ia32_vphaddwd(y))
+#   undef hsub
+#   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
+                                                 __builtin_ia32_vphsubwd(y))
+#  elif UINT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
+                                                         __builtin_ia32_vphadduwd((vhi_t)(y))))
+#   undef hsub
+#  endif
+# elif VEC_SIZE == 32
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps256(x)
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd256(x)
+#  endif
+# elif VEC_SIZE == FLOAT_SIZE
+#  if VEC_SIZE == 4
+#   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
+#  elif VEC_SIZE == 8
+#   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
+#  endif
+# endif
+#endif
 
 int simd_test(void)
 {
@@ -576,6 +656,29 @@ int simd_test(void)
     if ( !to_bool(y == z) ) return __LINE__;
 # endif
 
+# ifdef frac
+    touch(src);
+    x = frac(src);
+    touch(src);
+    if ( !to_bool(x == 0) ) return __LINE__;
+
+    x = 1 / (src + 1);
+    touch(x);
+    y = frac(x);
+    touch(x);
+    if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# if defined(trunc) && defined(frac)
+    x = src / 4;
+    touch(x);
+    y = trunc(x);
+    touch(x);
+    z = frac(x);
+    touch(x);
+    if ( !to_bool(x == y + z) ) return __LINE__;
+# endif
+
 #else
 
 # if ELEM_SIZE > 1
@@ -677,7 +780,7 @@ int simd_test(void)
     y = z << sh;
     if ( !to_bool(x == y + y) ) return __LINE__;
 
-#  if defined(__AVX2__) && ELEM_SIZE >= 4
+#  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
     if ( !to_bool(x == z) ) return __LINE__;
@@ -871,6 +974,8 @@ int simd_test(void)
 #endif
 
 #ifdef hadd
+# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
+     (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
     x = src;
     for ( i = ELEM_COUNT; i >>= 1; )
     {
@@ -878,6 +983,7 @@ int simd_test(void)
         x = hadd((vec_t){}, x);
     }
     if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+# endif
 
 # ifdef hsub
     touch(src);
@@ -889,6 +995,9 @@ int simd_test(void)
 # endif
 #endif
 
+#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+    return -fma_test();
+#endif
 
     return 0;
 }
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,6 +1,8 @@
 #include "simd.h"
 
+#ifndef __XOP__
 ENTRY(fma_test);
+#endif
 
 #if VEC_SIZE < 16
 # define to_bool(cmp) (!~(cmp)[0])
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -13,6 +13,7 @@
 #include "fma.h"
 #include "avx2.h"
 #include "avx2-sg.h"
+#include "xop.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -63,6 +64,11 @@ static bool simd_check_avx2(void)
 }
 #define simd_check_avx2_sg simd_check_avx2
 
+static bool simd_check_xop(void)
+{
+    return cpu_has_xop;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -191,6 +197,22 @@ static const struct {
     SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
     SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
     SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
+    SIMD(XOP 128bit single,       xop,      16f4),
+    SIMD(XOP 256bit single,       xop,      32f4),
+    SIMD(XOP 128bit double,       xop,      16f8),
+    SIMD(XOP 256bit double,       xop,      32f8),
+    SIMD(XOP s8x16,               xop,      16i1),
+    SIMD(XOP u8x16,               xop,      16u1),
+    SIMD(XOP s16x8,               xop,      16i2),
+    SIMD(XOP u16x8,               xop,      16u2),
+    SIMD(XOP s32x4,               xop,      16i4),
+    SIMD(XOP u32x4,               xop,      16u4),
+    SIMD(XOP s64x2,               xop,      16i8),
+    SIMD(XOP u64x2,               xop,      16u8),
+    SIMD(XOP i8x32,               xop,      32i1),
+    SIMD(XOP i16x16,              xop,      32i2),
+    SIMD(XOP i32x8,               xop,      32i4),
+    SIMD(XOP i64x4,               xop,      32i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -172,6 +172,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 6)) != 0; \
 })
 
+#define cpu_has_xop ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.c & (1U << 11)) != 0; \
+})
+
 #define cpu_has_fma4 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -435,6 +435,7 @@ static const struct {
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
+    [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -463,6 +464,17 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext8f08_table[256] = {
+    [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcc ... 0xcf] = { .simd_size = simd_packed_int },
+    [0xec ... 0xef] = { .simd_size = simd_packed_int },
 };
 
 static const struct {
@@ -470,6 +482,16 @@ static const struct {
     uint8_t two_op:1;
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
+    [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
+    [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd6 ... 0xd7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xe1 ... 0xe3] = { .simd_size = simd_packed_int, .two_op = 1 },
 };
 
 #define REX_PREFIX 0x40
@@ -528,7 +550,7 @@ union vex {
 #define copy_VEX(ptr, vex) ({ \
     if ( !mode_64bit() ) \
         (vex).reg |= 8; \
-    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \
     (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
     (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
     container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
@@ -1653,6 +1675,7 @@ static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 12, ctxt, ops)
 #define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
@@ -2985,9 +3008,19 @@ x86_decode(
     case simd_packed_int:
         switch ( vex.pfx )
         {
-        case vex_none: op_bytes = 8;           break;
-        case vex_66:   op_bytes = 16 << vex.l; break;
-        default:       op_bytes = 0;           break;
+        case vex_none:
+            if ( !vex.opcx )
+            {
+                op_bytes = 8;
+                break;
+            }
+            /* fall through */
+        case vex_66:
+            op_bytes = 16 << vex.l;
+            break;
+        default:
+            op_bytes = 0;
+            break;
         }
         break;
 
@@ -7996,6 +8029,13 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
@@ -8133,6 +8173,41 @@ x86_emulate(
             asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+    case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8e): /* vpmacssdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8f): /* vpmacssdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x95): /* vpmacsww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x96): /* vpmacswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x97): /* vpmacsdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9e): /* vpmacsdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9f): /* vpmacsdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xa6): /* vpmadcsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xb6): /* vpmadcswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc0): /* vprotb $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc1): /* vprotw $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc2): /* vprotd $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc3): /* vprotq $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcc): /* vpcomb $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcd): /* vpcomw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xce): /* vpcomd $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcf): /* vpcomq $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xec): /* vpcomub $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xed): /* vpcomuw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xee): /* vpcomud $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xef): /* vpcomuq $imm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa3): /* vpperm xmm/m128,xmm,xmm,xmm */
+                                    /* vpperm xmm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa2): /* vpcmov {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                    /* vpcmov {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -8182,6 +8257,61 @@ x86_emulate(
         }
         goto cannot_emulate;
 
+    case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m32,xmm */
+    case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m64,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x80): /* vfrczps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_XOP(09, 0x81): /* vfrczpd {x,y}mm/mem,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_ymm;
+
+    case X86EMUL_OPC_XOP(09, 0xc1): /* vphaddbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc2): /* vphaddbd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc3): /* vphaddbq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc6): /* vphaddwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc7): /* vphaddwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xcb): /* vphadddq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd1): /* vphaddubw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd2): /* vphaddubd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd3): /* vphaddubq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd6): /* vphadduwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd7): /* vphadduwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xdb): /* vphaddudq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe1): /* vphsubbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe2): /* vphsubwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe3): /* vphsubdq xmm/m128,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x90): /* vprotb xmm/m128,xmm,xmm */
+                                    /* vprotb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x91): /* vprotw xmm/m128,xmm,xmm */
+                                    /* vprotw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x92): /* vprotd xmm/m128,xmm,xmm */
+                                    /* vprotd xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x93): /* vprotq xmm/m128,xmm,xmm */
+                                    /* vprotq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x94): /* vpshlb xmm/m128,xmm,xmm */
+                                    /* vpshlb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x95): /* vpshlw xmm/m128,xmm,xmm */
+                                    /* vpshlw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x96): /* vpshld xmm/m128,xmm,xmm */
+                                    /* vpshld xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x97): /* vpshlq xmm/m128,xmm,xmm */
+                                    /* vpshlq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x98): /* vpshab xmm/m128,xmm,xmm */
+                                    /* vpshab xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x99): /* vpshaw xmm/m128,xmm,xmm */
+                                    /* vpshaw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9a): /* vpshad xmm/m128,xmm,xmm */
+                                    /* vpshad xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9b): /* vpshaq xmm/m128,xmm,xmm */
+                                    /* vpshaq xmm,xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
+#define cpu_has_xop             boot_cpu_has(X86_FEATURE_XOP)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
 #define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 12/17] x86emul: support 3DNow! insns
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (10 preceding siblings ...)
  2017-06-21 12:05 ` [PATCH 11/17] x86emul: support XOP insns Jan Beulich
@ 2017-06-21 12:05 ` Jan Beulich
  2017-06-21 12:06 ` [PATCH 13/17] x86emul: re-order checks in test harness Jan Beulich
                   ` (5 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:05 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 8863 bytes --]

Yes, recent AMD CPUs don't support them anymore, but I think we should
nevertheless cope.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
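
All 3DNow! insns share the opcode 0f 0f, with the actual operation
selected by an immediate-style suffix byte following the operands. The
tables added below therefore encode, per high nibble of that suffix, a
bitmap of the valid low nibbles. A minimal sketch of the lookup they
support (the helper name is illustrative only):

    #include <stdbool.h>
    #include <stdint.h>

    /* Bit n of entry [h] set => suffix byte (h << 4) | n is valid.
     * E.g. pfadd is 0f 0f /r 9e, so table[0x9] must have bit 0xe set. */
    static bool suffix_valid(const uint16_t table[16], uint8_t imm8)
    {
        return table[(imm8 >> 4) & 0xf] & (1u << (imm8 & 0xf));
    }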

--- a/.gitignore
+++ b/.gitignore
@@ -223,6 +223,7 @@
 tools/security/xensec_tool
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
+tools/tests/x86_emulator/3dnow*.[ch]
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2 xop
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -19,6 +19,9 @@ TESTCASES := blowfish $(SIMD) $(FMA) $(S
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
+3dnow-vecs := 8
+3dnow-ints :=
+3dnow-flts := 4
 sse-vecs := 16
 sse-ints :=
 sse-flts := 4
@@ -49,8 +52,13 @@ xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
-# the VEX.vvvv checks in the emulator.
-non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+# the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
+# use for floating point operations, to avoid mixing MMX and FPU register
+# uses.  Also enable 3DNow! extensions, but note that we can't use 3dnowa
+# as the test flavor right away, since -m3dnowa is understood only
+# by gcc 7.x and newer (older ones want a specific machine model instead).
+3dnowa := $(call cc-option,$(CC),-m3dnowa,-march=k8)
+non-sse = $(if $(filter sse%,$(1)),,$(if $(filter 3dnow%,$(1)),-msse -mfpmath=sse $(3dnowa),-ffixed-xmm0))
 
 define simd-defs
 $(1)-cflags := \
@@ -81,8 +89,9 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
 	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
 		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
 		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
-		(echo "static const unsigned int $(subst -,_,$*)_$(arch)$${flavor}[] = {"; \
+		(echo "static const unsigned int $${prefix}_$(arch)$${flavor}[] = {"; \
 		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
 		 echo "};") >>$@.new; \
 		rm -f $*.bin; \
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -48,6 +48,8 @@ static inline bool _to_bool(byte_vec_t b
 
 #if VEC_SIZE == FLOAT_SIZE
 # define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
+# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if FLOAT_SIZE == 4
 #  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
@@ -70,7 +72,24 @@ static inline bool _to_bool(byte_vec_t b
 })
 #endif
 
-#if FLOAT_SIZE == 4 && defined(__SSE__)
+#if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
+# define max __builtin_ia32_pfmax
+# define min __builtin_ia32_pfmin
+# define recip(x) ({ \
+    vec_t t_ = __builtin_ia32_pfrcp(x); \
+    touch(x); \
+    t_[1] = __builtin_ia32_pfrcp(__builtin_ia32_pswapdsf(x))[0]; \
+    touch(x); \
+    __builtin_ia32_pfrcpit2(__builtin_ia32_pfrcpit1(t_, x), t_); \
+})
+# define rsqrt(x) ({ \
+    vec_t t_ = __builtin_ia32_pfrsqrt(x); \
+    touch(x); \
+    t_[1] = __builtin_ia32_pfrsqrt(__builtin_ia32_pswapdsf(x))[0]; \
+    touch(x); \
+    __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
+})
+#elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
 #  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
 #  define max(x, y) __builtin_ia32_maxps256(x, y)
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@
 
 #include "x86_emulate.h"
 #include "blowfish.h"
+#include "3dnow.h"
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
@@ -28,6 +29,11 @@ static bool blowfish_check_regs(const st
     return regs->eax == 2 && regs->edx == 1;
 }
 
+static bool simd_check__3dnow(void)
+{
+    return cpu_has_3dnow_ext && cpu_has_sse;
+}
+
 static bool simd_check_sse(void)
 {
     return cpu_has_sse;
@@ -117,6 +123,7 @@ static const struct {
 #else
 # define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
 #endif
+    SIMD(3DNow! single,          _3dnow,     8f4),
     SIMD(SSE scalar single,      sse,         f4),
     SIMD(SSE packed single,      sse,       16f4),
     SIMD(SSE2 scalar single,     sse2,        f4),
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -166,6 +166,12 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 8)) != 0; \
 })
 
+#define cpu_has_3dnow_ext ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.d & (1U << 30)) != 0; \
+})
+
 #define cpu_has_sse4a ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(0x80000001, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -355,6 +355,36 @@ static const struct {
     [0xff] = { ModRM }
 };
 
+static const uint16_t _3dnow_table[16] = {
+    [0x0] = (1 << 0xd) /* pi2fd */,
+    [0x1] = (1 << 0xd) /* pf2id */,
+    [0x9] = (1 << 0x0) /* pfcmpge */ |
+            (1 << 0x4) /* pfmin */ |
+            (1 << 0x6) /* pfrcp */ |
+            (1 << 0x7) /* pfrsqrt */ |
+            (1 << 0xa) /* pfsub */ |
+            (1 << 0xe) /* pfadd */,
+    [0xa] = (1 << 0x0) /* pfcmpgt */ |
+            (1 << 0x4) /* pfmax */ |
+            (1 << 0x6) /* pfrcpit1 */ |
+            (1 << 0x7) /* pfrsqit1 */ |
+            (1 << 0xa) /* pfsubr */ |
+            (1 << 0xe) /* pfacc */,
+    [0xb] = (1 << 0x0) /* pfcmpeq */ |
+            (1 << 0x4) /* pfmul */ |
+            (1 << 0x6) /* pfrcpit2 */ |
+            (1 << 0x7) /* pmulhrw */ |
+            (1 << 0xf) /* pavgusb */,
+};
+
+static const uint16_t _3dnow_ext_table[16] = {
+    [0x0] = (1 << 0xc) /* pi2fw */,
+    [0x1] = (1 << 0xc) /* pf2iw */,
+    [0x8] = (1 << 0xa) /* pfnacc */ |
+            (1 << 0xe) /* pfpnacc */,
+    [0xb] = (1 << 0xb) /* pswapd */,
+};
+
 /*
  * "two_op" and "four_op" below refer to the number of register operands
  * (one of which possibly also allowing to be a memory one). The named
@@ -1670,6 +1700,8 @@ static bool vcpu_has(
 #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
 #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
+#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, EDX, 30, ctxt, ops)
+#define vcpu_has_3dnow()       vcpu_has(0x80000001, EDX, 31, ctxt, ops)
 #define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
 #define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
@@ -5480,6 +5512,26 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
+    case X86EMUL_OPC(0x0f, 0x0e): /* femms */
+        host_and_vcpu_must_have(3dnow);
+        asm volatile ( "femms" );
+        break;
+
+    case X86EMUL_OPC(0x0f, 0x0f): /* 3DNow! */
+        if ( _3dnow_ext_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
+            host_and_vcpu_must_have(3dnow_ext);
+        else if ( _3dnow_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
+            host_and_vcpu_must_have(3dnow);
+        else
+            generate_exception(EXC_UD);
+
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+
+        d = DstReg | SrcMem;
+        op_bytes = 8;
+        state->simd_size = simd_other;
+        goto simd_0f_imm8;
+
 #define CASE_SIMD_PACKED_INT(pfx, opc)       \
     case X86EMUL_OPC(pfx, opc):              \
     case X86EMUL_OPC_66(pfx, opc)
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,6 +71,8 @@
                                  && boot_cpu_has(X86_FEATURE_FFXSR))
 #define cpu_has_page1gb         boot_cpu_has(X86_FEATURE_PAGE1GB)
 #define cpu_has_rdtscp          boot_cpu_has(X86_FEATURE_RDTSCP)
+#define cpu_has_3dnow_ext       boot_cpu_has(X86_FEATURE_3DNOWEXT)
+#define cpu_has_3dnow           boot_cpu_has(X86_FEATURE_3DNOW)
 
 /* CPUID level 0x80000001.ecx */
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)



[-- Attachment #2: x86emul-3dnow.patch --]
[-- Type: text/plain, Size: 8892 bytes --]

x86emul: support 3DNow! insns

Yes, recent AMD CPUs don't support them anymore, but I think we should
nevertheless cope.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -223,6 +223,7 @@
 tools/security/xensec_tool
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
+tools/tests/x86_emulator/3dnow*.[ch]
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2 xop
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -19,6 +19,9 @@ TESTCASES := blowfish $(SIMD) $(FMA) $(S
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
+3dnow-vecs := 8
+3dnow-ints :=
+3dnow-flts := 4
 sse-vecs := 16
 sse-ints :=
 sse-flts := 4
@@ -49,8 +52,13 @@ xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
-# the VEX.vvvv checks in the emulator.
-non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+# the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
+# use for floating point operations, to avoid mixing MMX and FPU register
+# uses.  Also enable 3DNow! extensions, but note that we can't use 3dnowa
+# as the test flavor right away, since -m3dnowa is understood only
+# by gcc 7.x and newer (older ones want a specific machine model instead).
+3dnowa := $(call cc-option,$(CC),-m3dnowa,-march=k8)
+non-sse = $(if $(filter sse%,$(1)),,$(if $(filter 3dnow%,$(1)),-msse -mfpmath=sse $(3dnowa),-ffixed-xmm0))
 
 define simd-defs
 $(1)-cflags := \
@@ -81,8 +89,9 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
 	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
 		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
 		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
-		(echo "static const unsigned int $(subst -,_,$*)_$(arch)$${flavor}[] = {"; \
+		(echo "static const unsigned int $${prefix}_$(arch)$${flavor}[] = {"; \
 		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
 		 echo "};") >>$@.new; \
 		rm -f $*.bin; \
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -48,6 +48,8 @@ static inline bool _to_bool(byte_vec_t b
 
 #if VEC_SIZE == FLOAT_SIZE
 # define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
+# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if FLOAT_SIZE == 4
 #  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
@@ -70,7 +72,24 @@ static inline bool _to_bool(byte_vec_t b
 })
 #endif
 
-#if FLOAT_SIZE == 4 && defined(__SSE__)
+#if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
+# define max __builtin_ia32_pfmax
+# define min __builtin_ia32_pfmin
+# define recip(x) ({ \
+    vec_t t_ = __builtin_ia32_pfrcp(x); \
+    touch(x); \
+    t_[1] = __builtin_ia32_pfrcp(__builtin_ia32_pswapdsf(x))[0]; \
+    touch(x); \
+    __builtin_ia32_pfrcpit2(__builtin_ia32_pfrcpit1(t_, x), t_); \
+})
+# define rsqrt(x) ({ \
+    vec_t t_ = __builtin_ia32_pfrsqrt(x); \
+    touch(x); \
+    t_[1] = __builtin_ia32_pfrsqrt(__builtin_ia32_pswapdsf(x))[0]; \
+    touch(x); \
+    __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
+})
+#elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
 #  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
 #  define max(x, y) __builtin_ia32_maxps256(x, y)
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@
 
 #include "x86_emulate.h"
 #include "blowfish.h"
+#include "3dnow.h"
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
@@ -28,6 +29,11 @@ static bool blowfish_check_regs(const st
     return regs->eax == 2 && regs->edx == 1;
 }
 
+static bool simd_check__3dnow(void)
+{
+    return cpu_has_3dnow_ext && cpu_has_sse;
+}
+
 static bool simd_check_sse(void)
 {
     return cpu_has_sse;
@@ -117,6 +123,7 @@ static const struct {
 #else
 # define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
 #endif
+    SIMD(3DNow! single,          _3dnow,     8f4),
     SIMD(SSE scalar single,      sse,         f4),
     SIMD(SSE packed single,      sse,       16f4),
     SIMD(SSE2 scalar single,     sse2,        f4),
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -166,6 +166,12 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 8)) != 0; \
 })
 
+#define cpu_has_3dnow_ext ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.d & (1U << 30)) != 0; \
+})
+
 #define cpu_has_sse4a ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(0x80000001, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -355,6 +355,36 @@ static const struct {
     [0xff] = { ModRM }
 };
 
+static const uint16_t _3dnow_table[16] = {
+    [0x0] = (1 << 0xd) /* pi2fd */,
+    [0x1] = (1 << 0xd) /* pf2id */,
+    [0x9] = (1 << 0x0) /* pfcmpge */ |
+            (1 << 0x4) /* pfmin */ |
+            (1 << 0x6) /* pfrcp */ |
+            (1 << 0x7) /* pfrsqrt */ |
+            (1 << 0xa) /* pfsub */ |
+            (1 << 0xe) /* pfadd */,
+    [0xa] = (1 << 0x0) /* pfcmpgt */ |
+            (1 << 0x4) /* pfmax */ |
+            (1 << 0x6) /* pfrcpit1 */ |
+            (1 << 0x7) /* pfrsqit1 */ |
+            (1 << 0xa) /* pfsubr */ |
+            (1 << 0xe) /* pfacc */,
+    [0xb] = (1 << 0x0) /* pfcmpeq */ |
+            (1 << 0x4) /* pfmul */ |
+            (1 << 0x6) /* pfrcpit2 */ |
+            (1 << 0x7) /* pmulhrw */ |
+            (1 << 0xf) /* pavgusb */,
+};
+
+static const uint16_t _3dnow_ext_table[16] = {
+    [0x0] = (1 << 0xc) /* pi2fw */,
+    [0x1] = (1 << 0xc) /* pf2iw */,
+    [0x8] = (1 << 0xa) /* pfnacc */ |
+            (1 << 0xe) /* pfpnacc */,
+    [0xb] = (1 << 0xb) /* pswapd */,
+};
+
 /*
  * "two_op" and "four_op" below refer to the number of register operands
  * (one of which possibly also allowing to be a memory one). The named
@@ -1670,6 +1700,8 @@ static bool vcpu_has(
 #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
 #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
+#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, EDX, 30, ctxt, ops)
+#define vcpu_has_3dnow()       vcpu_has(0x80000001, EDX, 31, ctxt, ops)
 #define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
 #define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
@@ -5480,6 +5512,26 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
+    case X86EMUL_OPC(0x0f, 0x0e): /* femms */
+        host_and_vcpu_must_have(3dnow);
+        asm volatile ( "femms" );
+        break;
+
+    case X86EMUL_OPC(0x0f, 0x0f): /* 3DNow! */
+        if ( _3dnow_ext_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
+            host_and_vcpu_must_have(3dnow_ext);
+        else if ( _3dnow_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
+            host_and_vcpu_must_have(3dnow);
+        else
+            generate_exception(EXC_UD);
+
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+
+        d = DstReg | SrcMem;
+        op_bytes = 8;
+        state->simd_size = simd_other;
+        goto simd_0f_imm8;
+
 #define CASE_SIMD_PACKED_INT(pfx, opc)       \
     case X86EMUL_OPC(pfx, opc):              \
     case X86EMUL_OPC_66(pfx, opc)
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,6 +71,8 @@
                                  && boot_cpu_has(X86_FEATURE_FFXSR))
 #define cpu_has_page1gb         boot_cpu_has(X86_FEATURE_PAGE1GB)
 #define cpu_has_rdtscp          boot_cpu_has(X86_FEATURE_RDTSCP)
+#define cpu_has_3dnow_ext       boot_cpu_has(X86_FEATURE_3DNOWEXT)
+#define cpu_has_3dnow           boot_cpu_has(X86_FEATURE_3DNOW)
 
 /* CPUID level 0x80000001.ecx */
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 13/17] x86emul: re-order checks in test harness
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (11 preceding siblings ...)
  2017-06-21 12:05 ` [PATCH 12/17] x86emul: support 3DNow! insns Jan Beulich
@ 2017-06-21 12:06 ` Jan Beulich
  2017-06-21 12:07 ` [PATCH 14/17] x86emul: abstract out XCRn accesses Jan Beulich
                   ` (4 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:06 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 1238 bytes --]

On older systems, printing the "n/a" messages (which result from the
compiler not being new enough to deal with some of the test code) isn't
very useful: if both the CPU and the compiler are too old for a certain
test, we may as well omit those messages, as those tests wouldn't be run
even if the compiler did produce code. (This became obvious with the
3DNow! tests, which I had to run on an older system still supporting
those insns; that system naturally also had an older compiler.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3267,15 +3267,15 @@ int main(int argc, char **argv)
 
     for ( j = 0; j < ARRAY_SIZE(blobs); j++ )
     {
+        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
+            continue;
+
         if ( !blobs[j].size )
         {
             printf("%-39s n/a\n", blobs[j].name);
             continue;
         }
 
-        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
-            continue;
-
         memcpy(res, blobs[j].code, blobs[j].size);
         ctxt.lma = blobs[j].bitness == 64;
         ctxt.addr_size = ctxt.sp_size = blobs[j].bitness;




[-- Attachment #2: x86emul-test-reorder-checks.patch --]
[-- Type: text/plain, Size: 1276 bytes --]

x86emul: re-order checks in test harness

On older systems, printing the "n/a" messages (which result from the
compiler not being new enough to deal with some of the test code) isn't
very useful: if both the CPU and the compiler are too old for a certain
test, we may as well omit those messages, as those tests wouldn't be run
even if the compiler did produce code. (This became obvious with the
3DNow! tests, which I had to run on an older system still supporting
those insns; that system naturally also had an older compiler.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3267,15 +3267,15 @@ int main(int argc, char **argv)
 
     for ( j = 0; j < ARRAY_SIZE(blobs); j++ )
     {
+        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
+            continue;
+
         if ( !blobs[j].size )
         {
             printf("%-39s n/a\n", blobs[j].name);
             continue;
         }
 
-        if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
-            continue;
-
         memcpy(res, blobs[j].code, blobs[j].size);
         ctxt.lma = blobs[j].bitness == 64;
         ctxt.addr_size = ctxt.sp_size = blobs[j].bitness;

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 14/17] x86emul: abstract out XCRn accesses
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (12 preceding siblings ...)
  2017-06-21 12:06 ` [PATCH 13/17] x86emul: re-order checks in test harness Jan Beulich
@ 2017-06-21 12:07 ` Jan Beulich
  2017-06-21 12:07 ` [PATCH 15/17] x86emul: adjust_bnd() should check XCR0 Jan Beulich
                   ` (3 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 21656 bytes --]

Use hooks, just as is done for other special purpose registers.

This includes moving the XCR0 checks from hvmemul_get_fpu() into the
emulator itself, as well as adding support for XGETBV emulation.

For now, fuzzer reads will obtain the real values (apart from the
fuzzing of the hook pointer itself).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
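
To illustrate how the new hook is meant to be consumed (a sketch only:
the XSTATE_* values stand in for Xen's real definitions, and the
callback type is simplified from the actual ops structure):

    #include <stdbool.h>
    #include <stdint.h>

    #define XSTATE_SSE (1ULL << 1) /* XCR0 bit 1: SSE state */
    #define XSTATE_YMM (1ULL << 2) /* XCR0 bit 2: AVX/YMM state */

    typedef int (*read_xcr_fn)(unsigned int reg, uint64_t *val, void *ctxt);

    /* YMM emulation may proceed only if guest XCR0 enables SSE and YMM. */
    static bool ymm_state_enabled(read_xcr_fn read_xcr, void *ctxt)
    {
        uint64_t xcr0;

        if ( !read_xcr || read_xcr(0, &xcr0, ctxt) != 0 /* X86EMUL_OKAY */ )
            return false;

        return (xcr0 & (XSTATE_SSE | XSTATE_YMM)) ==
               (XSTATE_SSE | XSTATE_YMM);
    }

This mirrors the check _get_fpu() now performs for X86EMUL_FPU_ymm.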

--- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
+++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
@@ -409,6 +409,8 @@ static int fuzz_write_cr(
     return X86EMUL_OKAY;
 }
 
+#define fuzz_read_xcr emul_test_read_xcr
+
 enum {
     MSRI_IA32_SYSENTER_CS,
     MSRI_IA32_SYSENTER_ESP,
@@ -527,6 +529,7 @@ static const struct x86_emulate_ops all_
     SET(write_io),
     SET(read_cr),
     SET(write_cr),
+    SET(read_xcr),
     SET(read_msr),
     SET(write_msr),
     SET(wbinvd),
@@ -635,6 +638,7 @@ enum {
     HOOK_write_cr,
     HOOK_read_dr,
     HOOK_write_dr,
+    HOOK_read_xcr,
     HOOK_read_msr,
     HOOK_write_msr,
     HOOK_wbinvd,
@@ -679,6 +683,7 @@ static void disable_hooks(struct x86_emu
     MAYBE_DISABLE_HOOK(write_io);
     MAYBE_DISABLE_HOOK(read_cr);
     MAYBE_DISABLE_HOOK(write_cr);
+    MAYBE_DISABLE_HOOK(read_xcr);
     MAYBE_DISABLE_HOOK(read_msr);
     MAYBE_DISABLE_HOOK(write_msr);
     MAYBE_DISABLE_HOOK(wbinvd);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -368,6 +368,7 @@ static struct x86_emulate_ops emulops =
     .read_segment = read_segment,
     .cpuid      = emul_test_cpuid,
     .read_cr    = emul_test_read_cr,
+    .read_xcr   = emul_test_read_xcr,
     .read_msr   = read_msr,
     .get_fpu    = emul_test_get_fpu,
     .put_fpu    = emul_test_put_fpu,
--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -120,6 +120,19 @@ int emul_test_read_cr(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int emul_test_read_xcr(
+    unsigned int reg,
+    uint64_t *val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    uint32_t lo, hi;
+
+    asm ( "xgetbv" : "=a" (lo), "=d" (hi) : "c" (reg) );
+    *val = lo | ((uint64_t)hi << 32);
+
+    return X86EMUL_OKAY;
+}
+
 int emul_test_get_fpu(
     void (*exception_callback)(void *, struct cpu_user_regs *),
     void *exception_callback_arg,
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -215,6 +215,11 @@ int emul_test_read_cr(
     unsigned long *val,
     struct x86_emulate_ctxt *ctxt);
 
+int emul_test_read_xcr(
+    unsigned int reg,
+    uint64_t *val,
+    struct x86_emulate_ctxt *ctxt);
+
 int emul_test_get_fpu(
     void (*exception_callback)(void *, struct cpu_user_regs *),
     void *exception_callback_arg,
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1643,6 +1643,49 @@ static int hvmemul_write_cr(
     return rc;
 }
 
+static int hvmemul_read_xcr(
+    unsigned int reg,
+    uint64_t *val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    uint32_t lo, hi;
+
+    switch ( reg )
+    {
+    case 0:
+        *val = current->arch.xcr0;
+        return X86EMUL_OKAY;
+
+    case 1:
+        if ( !cpu_has_xgetbv1 )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    asm ( ".byte 0x0f,0x01,0xd0" /* xgetbv */
+          : "=a" (lo), "=d" (hi) : "c" (reg) );
+    *val = lo | ((uint64_t)hi << 32);
+    HVMTRACE_LONG_2D(XCR_READ, reg, TRC_PAR_LONG(*val));
+
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_write_xcr(
+    unsigned int reg,
+    uint64_t val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    HVMTRACE_LONG_2D(XCR_WRITE, reg, TRC_PAR_LONG(val));
+    if ( likely(handle_xsetbv(reg, val) == 0) )
+        return X86EMUL_OKAY;
+
+    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+    return X86EMUL_EXCEPTION;
+}
+
 static int hvmemul_read_msr(
     unsigned int reg,
     uint64_t *val,
@@ -1691,22 +1734,6 @@ static int hvmemul_get_fpu(
 {
     struct vcpu *curr = current;
 
-    switch ( type )
-    {
-    case X86EMUL_FPU_fpu:
-    case X86EMUL_FPU_wait:
-    case X86EMUL_FPU_mmx:
-    case X86EMUL_FPU_xmm:
-        break;
-    case X86EMUL_FPU_ymm:
-        if ( !(curr->arch.xcr0 & XSTATE_SSE) ||
-             !(curr->arch.xcr0 & XSTATE_YMM) )
-            return X86EMUL_UNHANDLEABLE;
-        break;
-    default:
-        return X86EMUL_UNHANDLEABLE;
-    }
-
     if ( !curr->fpu_dirtied )
         hvm_funcs.fpu_dirty_intercept();
     else if ( type == X86EMUL_FPU_fpu )
@@ -1890,6 +1917,8 @@ static const struct x86_emulate_ops hvm_
     .write_io      = hvmemul_write_io,
     .read_cr       = hvmemul_read_cr,
     .write_cr      = hvmemul_write_cr,
+    .read_xcr      = hvmemul_read_xcr,
+    .write_xcr     = hvmemul_write_xcr,
     .read_msr      = hvmemul_read_msr,
     .write_msr     = hvmemul_write_msr,
     .wbinvd        = hvmemul_wbinvd,
@@ -1915,6 +1944,8 @@ static const struct x86_emulate_ops hvm_
     .write_io      = hvmemul_write_io_discard,
     .read_cr       = hvmemul_read_cr,
     .write_cr      = hvmemul_write_cr,
+    .read_xcr      = hvmemul_read_xcr,
+    .write_xcr     = hvmemul_write_xcr,
     .read_msr      = hvmemul_read_msr,
     .write_msr     = hvmemul_write_msr_discard,
     .wbinvd        = hvmemul_wbinvd_discard,
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2493,6 +2493,16 @@ static int priv_op_write_dr(unsigned int
            ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
 }
 
+static int priv_op_write_xcr(unsigned int reg, uint64_t val,
+                             struct x86_emulate_ctxt *ctxt)
+{
+    if ( likely(handle_xsetbv(reg, val) == 0) )
+        return X86EMUL_OKAY;
+
+    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+    return X86EMUL_EXCEPTION;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -2969,6 +2979,7 @@ static const struct x86_emulate_ops priv
     .write_cr            = priv_op_write_cr,
     .read_dr             = priv_op_read_dr,
     .write_dr            = priv_op_write_dr,
+    .write_xcr           = priv_op_write_xcr,
     .read_msr            = priv_op_read_msr,
     .write_msr           = priv_op_write_msr,
     .cpuid               = pv_emul_cpuid,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1117,10 +1117,27 @@ static int _get_fpu(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
+    uint64_t xcr0;
     int rc;
 
     fail_if(!ops->get_fpu);
     ASSERT(type != X86EMUL_FPU_none);
+
+    if ( type < X86EMUL_FPU_ymm || !ops->read_xcr ||
+         ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY )
+        xcr0 = 0;
+
+    switch ( type )
+    {
+    case X86EMUL_FPU_ymm:
+        if ( !(xcr0 & XSTATE_SSE) || !(xcr0 & XSTATE_YMM) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
+    default:
+        break;
+    }
+
     rc = ops->get_fpu(fpu_handle_exception, fic, type, ctxt);
 
     if ( rc == X86EMUL_OKAY )
@@ -1648,7 +1665,8 @@ in_protmode(
 #define EBX 3
 
 static bool vcpu_has(
-    unsigned int eax,
+    unsigned int leaf,
+    unsigned int subleaf,
     unsigned int reg,
     unsigned int bit,
     struct x86_emulate_ctxt *ctxt,
@@ -1658,7 +1676,7 @@ static bool vcpu_has(
     int rc = X86EMUL_OKAY;
 
     fail_if(!ops->cpuid);
-    rc = ops->cpuid(eax, 0, &res, ctxt);
+    rc = ops->cpuid(leaf, subleaf, &res, ctxt);
     if ( rc == X86EMUL_OKAY )
     {
         switch ( reg )
@@ -1677,53 +1695,56 @@ static bool vcpu_has(
     return rc == X86EMUL_OKAY;
 }
 
-#define vcpu_has_fpu()         vcpu_has(         1, EDX,  0, ctxt, ops)
-#define vcpu_has_sep()         vcpu_has(         1, EDX, 11, ctxt, ops)
-#define vcpu_has_cx8()         vcpu_has(         1, EDX,  8, ctxt, ops)
-#define vcpu_has_cmov()        vcpu_has(         1, EDX, 15, ctxt, ops)
-#define vcpu_has_clflush()     vcpu_has(         1, EDX, 19, ctxt, ops)
-#define vcpu_has_mmx()         vcpu_has(         1, EDX, 23, ctxt, ops)
-#define vcpu_has_sse()         vcpu_has(         1, EDX, 25, ctxt, ops)
-#define vcpu_has_sse2()        vcpu_has(         1, EDX, 26, ctxt, ops)
-#define vcpu_has_sse3()        vcpu_has(         1, ECX,  0, ctxt, ops)
-#define vcpu_has_pclmulqdq()   vcpu_has(         1, ECX,  1, ctxt, ops)
-#define vcpu_has_ssse3()       vcpu_has(         1, ECX,  9, ctxt, ops)
-#define vcpu_has_fma()         vcpu_has(         1, ECX, 12, ctxt, ops)
-#define vcpu_has_cx16()        vcpu_has(         1, ECX, 13, ctxt, ops)
-#define vcpu_has_sse4_1()      vcpu_has(         1, ECX, 19, ctxt, ops)
-#define vcpu_has_sse4_2()      vcpu_has(         1, ECX, 20, ctxt, ops)
-#define vcpu_has_movbe()       vcpu_has(         1, ECX, 22, ctxt, ops)
-#define vcpu_has_popcnt()      vcpu_has(         1, ECX, 23, ctxt, ops)
-#define vcpu_has_aesni()       vcpu_has(         1, ECX, 25, ctxt, ops)
-#define vcpu_has_avx()         vcpu_has(         1, ECX, 28, ctxt, ops)
-#define vcpu_has_f16c()        vcpu_has(         1, ECX, 29, ctxt, ops)
-#define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
-#define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
+#define X 0 /* Just for documentation purposes. */
+
+#define vcpu_has_fpu()         vcpu_has(         1, X, EDX,  0, ctxt, ops)
+#define vcpu_has_sep()         vcpu_has(         1, X, EDX, 11, ctxt, ops)
+#define vcpu_has_cx8()         vcpu_has(         1, X, EDX,  8, ctxt, ops)
+#define vcpu_has_cmov()        vcpu_has(         1, X, EDX, 15, ctxt, ops)
+#define vcpu_has_clflush()     vcpu_has(         1, X, EDX, 19, ctxt, ops)
+#define vcpu_has_mmx()         vcpu_has(         1, X, EDX, 23, ctxt, ops)
+#define vcpu_has_sse()         vcpu_has(         1, X, EDX, 25, ctxt, ops)
+#define vcpu_has_sse2()        vcpu_has(         1, X, EDX, 26, ctxt, ops)
+#define vcpu_has_sse3()        vcpu_has(         1, X, ECX,  0, ctxt, ops)
+#define vcpu_has_pclmulqdq()   vcpu_has(         1, X, ECX,  1, ctxt, ops)
+#define vcpu_has_ssse3()       vcpu_has(         1, X, ECX,  9, ctxt, ops)
+#define vcpu_has_fma()         vcpu_has(         1, X, ECX, 12, ctxt, ops)
+#define vcpu_has_cx16()        vcpu_has(         1, X, ECX, 13, ctxt, ops)
+#define vcpu_has_sse4_1()      vcpu_has(         1, X, ECX, 19, ctxt, ops)
+#define vcpu_has_sse4_2()      vcpu_has(         1, X, ECX, 20, ctxt, ops)
+#define vcpu_has_movbe()       vcpu_has(         1, X, ECX, 22, ctxt, ops)
+#define vcpu_has_popcnt()      vcpu_has(         1, X, ECX, 23, ctxt, ops)
+#define vcpu_has_aesni()       vcpu_has(         1, X, ECX, 25, ctxt, ops)
+#define vcpu_has_avx()         vcpu_has(         1, X, ECX, 28, ctxt, ops)
+#define vcpu_has_f16c()        vcpu_has(         1, X, ECX, 29, ctxt, ops)
+#define vcpu_has_rdrand()      vcpu_has(         1, X, ECX, 30, ctxt, ops)
+#define vcpu_has_mmxext()     (vcpu_has(0x80000001, X, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
-#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, EDX, 30, ctxt, ops)
-#define vcpu_has_3dnow()       vcpu_has(0x80000001, EDX, 31, ctxt, ops)
-#define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
-#define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
-#define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
-#define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
-#define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
-#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 12, ctxt, ops)
-#define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
-#define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
-#define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
-#define vcpu_has_hle()         vcpu_has(         7, EBX,  4, ctxt, ops)
-#define vcpu_has_avx2()        vcpu_has(         7, EBX,  5, ctxt, ops)
-#define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
-#define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
-#define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
-#define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
-#define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
-#define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
-#define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
-#define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
-#define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
-#define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
-#define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
+#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, X, EDX, 30, ctxt, ops)
+#define vcpu_has_3dnow()       vcpu_has(0x80000001, X, EDX, 31, ctxt, ops)
+#define vcpu_has_lahf_lm()     vcpu_has(0x80000001, X, ECX,  0, ctxt, ops)
+#define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, X, ECX,  4, ctxt, ops)
+#define vcpu_has_lzcnt()       vcpu_has(0x80000001, X, ECX,  5, ctxt, ops)
+#define vcpu_has_sse4a()       vcpu_has(0x80000001, X, ECX,  6, ctxt, ops)
+#define vcpu_has_misalignsse() vcpu_has(0x80000001, X, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, X, ECX, 12, ctxt, ops)
+#define vcpu_has_fma4()        vcpu_has(0x80000001, X, ECX, 16, ctxt, ops)
+#define vcpu_has_tbm()         vcpu_has(0x80000001, X, ECX, 21, ctxt, ops)
+#define vcpu_has_bmi1()        vcpu_has(         7, 0, EBX,  3, ctxt, ops)
+#define vcpu_has_hle()         vcpu_has(         7, 0, EBX,  4, ctxt, ops)
+#define vcpu_has_avx2()        vcpu_has(         7, 0, EBX,  5, ctxt, ops)
+#define vcpu_has_bmi2()        vcpu_has(         7, 0, EBX,  8, ctxt, ops)
+#define vcpu_has_rtm()         vcpu_has(         7, 0, EBX, 11, ctxt, ops)
+#define vcpu_has_mpx()         vcpu_has(         7, 0, EBX, 14, ctxt, ops)
+#define vcpu_has_rdseed()      vcpu_has(         7, 0, EBX, 18, ctxt, ops)
+#define vcpu_has_adx()         vcpu_has(         7, 0, EBX, 19, ctxt, ops)
+#define vcpu_has_smap()        vcpu_has(         7, 0, EBX, 20, ctxt, ops)
+#define vcpu_has_clflushopt()  vcpu_has(         7, 0, EBX, 23, ctxt, ops)
+#define vcpu_has_clwb()        vcpu_has(         7, 0, EBX, 24, ctxt, ops)
+#define vcpu_has_sha()         vcpu_has(         7, 0, EBX, 29, ctxt, ops)
+#define vcpu_has_rdpid()       vcpu_has(         7, 0, ECX, 22, ctxt, ops)
+#define vcpu_has_xgetbv1()     vcpu_has(       0xd, 1, EAX,  2, ctxt, ops)
+#define vcpu_has_clzero()      vcpu_has(0x80000008, X, EBX,  0, ctxt, ops)
 
 #define vcpu_must_have(feat) \
     generate_exception_if(!vcpu_has_##feat(), EXC_UD)
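
For reference, the second macro argument added here is the CPUID
sub-leaf; it matters only for leaf 7 and leaf 0xd. A hypothetical
bare-metal equivalent of the vcpu_has_xgetbv1() check above would look
like this (illustrative sketch, not part of the patch; assumes
<stdint.h> and <stdbool.h>):

    static bool host_has_xgetbv1(void)
    {
        uint32_t eax, ebx, ecx, edx;

        /* CPUID leaf 0xd, sub-leaf 1: EAX bit 2 = XGETBV with ECX=1. */
        asm ( "cpuid"
              : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
              : "0" (0xd), "2" (1) );

        return eax & (1u << 2);
    }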
@@ -5144,18 +5165,33 @@ x86_emulate(
                 _regs.eflags |= X86_EFLAGS_AC;
             goto complete_insn;
 
-#ifdef __XEN__
-        case 0xd1: /* xsetbv */
+        case 0xd0: /* xgetbv */
             generate_exception_if(vex.pfx, EXC_UD);
-            if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+            if ( !ops->read_cr || !ops->read_xcr ||
+                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
                 cr4 = 0;
             generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
-            generate_exception_if(!mode_ring0() ||
-                                  handle_xsetbv(_regs.ecx,
-                                                _regs.eax | (_regs.rdx << 32)),
+            generate_exception_if(_regs.ecx > (vcpu_has_xgetbv1() ? 1 : 0),
                                   EXC_GP, 0);
+            rc = ops->read_xcr(_regs.ecx, &msr_val, ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
+            _regs.r(ax) = (uint32_t)msr_val;
+            _regs.r(dx) = msr_val >> 32;
+            goto complete_insn;
+
+        case 0xd1: /* xsetbv */
+            generate_exception_if(vex.pfx, EXC_UD);
+            if ( !ops->read_cr || !ops->write_xcr ||
+                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+                cr4 = 0;
+            generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
+            generate_exception_if(!mode_ring0() || _regs.ecx, EXC_GP, 0);
+            rc = ops->write_xcr(_regs.ecx,
+                                _regs.eax | ((uint64_t)_regs.edx << 32), ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
             goto complete_insn;
-#endif
 
         case 0xd4: /* vmfunc */
             generate_exception_if(vex.pfx, EXC_UD);
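
Both cases follow the hardware register convention: ECX selects the
XCR, and the 64-bit value travels in EDX:EAX. A minimal sketch of the
raw instructions (hypothetical helpers; xsetbv is ring-0 only, and the
.byte encodings match the ones used elsewhere in this patch):

    static uint64_t my_xgetbv(uint32_t xcr)
    {
        uint32_t lo, hi;

        asm volatile ( ".byte 0x0f, 0x01, 0xd0" /* xgetbv */
                       : "=a" (lo), "=d" (hi) : "c" (xcr) );

        return lo | ((uint64_t)hi << 32);
    }

    static void my_xsetbv(uint32_t xcr, uint64_t val)
    {
        asm volatile ( ".byte 0x0f, 0x01, 0xd1" /* xsetbv */
                       :: "c" (xcr), "a" ((uint32_t)val),
                          "d" ((uint32_t)(val >> 32)) );
    }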
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -380,6 +380,24 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * read_xcr: Read from extended control register.
+     *  @reg:   [IN ] Register to read.
+     */
+    int (*read_xcr)(
+        unsigned int reg,
+        uint64_t *val,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
+     * write_xcr: Write to extended control register.
+     *  @reg:   [IN ] Register to write.
+     */
+    int (*write_xcr)(
+        unsigned int reg,
+        uint64_t val,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * read_msr: Read from model-specific register.
      *  @reg:   [IN ] Register to read.
      */
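
For illustration, these are the architectural constraints a write_xcr
implementation has to enforce (Xen's handle_xsetbv() is the real
arbiter; the sketch below is hypothetical and deliberately incomplete):

    static int my_write_xcr(unsigned int reg, uint64_t val,
                            struct x86_emulate_ctxt *ctxt)
    {
        if ( reg )                         /* only XCR0 is defined so far */
            return X86EMUL_UNHANDLEABLE;
        if ( !(val & XSTATE_FP) )          /* x87 state can't be disabled */
            return X86EMUL_UNHANDLEABLE;
        if ( (val & XSTATE_YMM) && !(val & XSTATE_SSE) )
            return X86EMUL_UNHANDLEABLE;   /* YMM depends on SSE */
        if ( !(val & XSTATE_BNDREGS) != !(val & XSTATE_BNDCSR) )
            return X86EMUL_UNHANDLEABLE;   /* MPX bits come as a pair */
        /* Reserved and unsupported bits would need rejecting as well. */
        return X86EMUL_OKAY;
    }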
--- a/xen/include/asm-x86/hvm/trace.h
+++ b/xen/include/asm-x86/hvm/trace.h
@@ -33,6 +33,8 @@
 #define DO_TRC_HVM_CR_WRITE64  DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_DR_READ     DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_DR_WRITE    DEFAULT_HVM_REGACCESS
+#define DO_TRC_HVM_XCR_READ64  DEFAULT_HVM_REGACCESS
+#define DO_TRC_HVM_XCR_WRITE64 DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_MSR_READ    DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_MSR_WRITE   DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_RDTSC       DEFAULT_HVM_REGACCESS
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -66,4 +66,28 @@
 #define X86_CR4_SMAP       0x00200000 /* enable SMAP */
 #define X86_CR4_PKE        0x00400000 /* enable PKE */
 
+/*
+ * XSTATE component flags in XCR0
+ */
+#define _XSTATE_FP                0
+#define XSTATE_FP                 (1ULL << _XSTATE_FP)
+#define _XSTATE_SSE               1
+#define XSTATE_SSE                (1ULL << _XSTATE_SSE)
+#define _XSTATE_YMM               2
+#define XSTATE_YMM                (1ULL << _XSTATE_YMM)
+#define _XSTATE_BNDREGS           3
+#define XSTATE_BNDREGS            (1ULL << _XSTATE_BNDREGS)
+#define _XSTATE_BNDCSR            4
+#define XSTATE_BNDCSR             (1ULL << _XSTATE_BNDCSR)
+#define _XSTATE_OPMASK            5
+#define XSTATE_OPMASK             (1ULL << _XSTATE_OPMASK)
+#define _XSTATE_ZMM               6
+#define XSTATE_ZMM                (1ULL << _XSTATE_ZMM)
+#define _XSTATE_HI_ZMM            7
+#define XSTATE_HI_ZMM             (1ULL << _XSTATE_HI_ZMM)
+#define _XSTATE_PKRU              9
+#define XSTATE_PKRU               (1ULL << _XSTATE_PKRU)
+#define _XSTATE_LWP               62
+#define XSTATE_LWP                (1ULL << _XSTATE_LWP)
+
 #endif	/* __XEN_X86_DEFNS_H__ */
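
A small sketch of how these flags combine in practice, mirroring the
check _get_fpu() now performs for X86EMUL_FPU_ymm (hypothetical
helper):

    static inline bool xcr0_allows_avx(uint64_t xcr0)
    {
        /* YMM state is usable only when SSE state is enabled too. */
        return (xcr0 & (XSTATE_SSE | XSTATE_YMM)) ==
                       (XSTATE_SSE | XSTATE_YMM);
    }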
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -10,6 +10,7 @@
 
 #include <xen/sched.h>
 #include <asm/cpufeature.h>
+#include <asm/x86-defns.h>
 
 #define FCW_DEFAULT               0x037f
 #define FCW_RESET                 0x0040
@@ -28,27 +29,6 @@ extern uint32_t mxcsr_mask;
 #define XSAVE_HDR_OFFSET          FXSAVE_SIZE
 #define XSTATE_AREA_MIN_SIZE      (FXSAVE_SIZE + XSAVE_HDR_SIZE)
 
-#define _XSTATE_FP                0
-#define XSTATE_FP                 (1ULL << _XSTATE_FP)
-#define _XSTATE_SSE               1
-#define XSTATE_SSE                (1ULL << _XSTATE_SSE)
-#define _XSTATE_YMM               2
-#define XSTATE_YMM                (1ULL << _XSTATE_YMM)
-#define _XSTATE_BNDREGS           3
-#define XSTATE_BNDREGS            (1ULL << _XSTATE_BNDREGS)
-#define _XSTATE_BNDCSR            4
-#define XSTATE_BNDCSR             (1ULL << _XSTATE_BNDCSR)
-#define _XSTATE_OPMASK            5
-#define XSTATE_OPMASK             (1ULL << _XSTATE_OPMASK)
-#define _XSTATE_ZMM               6
-#define XSTATE_ZMM                (1ULL << _XSTATE_ZMM)
-#define _XSTATE_HI_ZMM            7
-#define XSTATE_HI_ZMM             (1ULL << _XSTATE_HI_ZMM)
-#define _XSTATE_PKRU              9
-#define XSTATE_PKRU               (1ULL << _XSTATE_PKRU)
-#define _XSTATE_LWP               62
-#define XSTATE_LWP                (1ULL << _XSTATE_LWP)
-
 #define XSTATE_FP_SSE  (XSTATE_FP | XSTATE_SSE)
 #define XCNTXT_MASK    (XSTATE_FP | XSTATE_SSE | XSTATE_YMM | XSTATE_OPMASK | \
                         XSTATE_ZMM | XSTATE_HI_ZMM | XSTATE_NONLAZY)
--- a/xen/include/public/trace.h
+++ b/xen/include/public/trace.h
@@ -234,6 +234,8 @@
 #define TRC_HVM_TRAP             (TRC_HVM_HANDLER + 0x23)
 #define TRC_HVM_TRAP_DEBUG       (TRC_HVM_HANDLER + 0x24)
 #define TRC_HVM_VLAPIC           (TRC_HVM_HANDLER + 0x25)
+#define TRC_HVM_XCR_READ64      (TRC_HVM_HANDLER + TRC_64_FLAG + 0x26)
+#define TRC_HVM_XCR_WRITE64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x27)
 
 #define TRC_HVM_IOPORT_WRITE    (TRC_HVM_HANDLER + 0x216)
 #define TRC_HVM_IOMEM_WRITE     (TRC_HVM_HANDLER + 0x217)



[-- Attachment #2: x86emul-XCR-accesses.patch --]
[-- Type: text/plain, Size: 21691 bytes --]

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 15/17] x86emul: adjust_bnd() should check XCR0
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (13 preceding siblings ...)
  2017-06-21 12:07 ` [PATCH 14/17] x86emul: abstract out XCRn accesses Jan Beulich
@ 2017-06-21 12:07 ` Jan Beulich
  2017-06-21 12:08 ` [PATCH 16/17] x86emul: make all FPU emulation use the stub Jan Beulich
                   ` (2 subsequent siblings)
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 935 bytes --]

Experimentally, MPX instructions have been confirmed to behave as NOPs
unless both of the related XCR0 bits (BNDREGS and BNDCSR) are set to 1.
By implication, branches then also don't clear BNDn.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2141,12 +2141,16 @@ static bool umip_active(struct x86_emula
 static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
                        const struct x86_emulate_ops *ops, enum vex_pfx pfx)
 {
-    uint64_t bndcfg;
+    uint64_t xcr0, bndcfg;
     int rc;
 
     if ( pfx == vex_f2 || !cpu_has_mpx || !vcpu_has_mpx() )
         return;
 
+    if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY ||
+         !(xcr0 & XSTATE_BNDREGS) || !(xcr0 & XSTATE_BNDCSR) )
+        return;
+
     if ( !mode_ring0() )
         bndcfg = read_bndcfgu();
     else if ( !ops->read_msr ||
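
Taken together with the pre-existing BNDCFG check, the condition the
function now establishes amounts to the following (a hypothetical
consolidation, not verbatim code; bit 0 of BNDCFGU/BNDCFGS is the
enable bit):

    static bool mpx_live(uint64_t xcr0, uint64_t bndcfg)
    {
        return (xcr0 & (XSTATE_BNDREGS | XSTATE_BNDCSR)) ==
                       (XSTATE_BNDREGS | XSTATE_BNDCSR) &&
               (bndcfg & 1 /* BNDCFGx.EN */);
    }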




[-- Attachment #2: x86emul-adjust_bnd-XCR0.patch --]
[-- Type: text/plain, Size: 972 bytes --]

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 16/17] x86emul: make all FPU emulation use the stub
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (14 preceding siblings ...)
  2017-06-21 12:07 ` [PATCH 15/17] x86emul: adjust_bnd() should check XCR0 Jan Beulich
@ 2017-06-21 12:08 ` Jan Beulich
  2017-06-21 12:09 ` [PATCH 17/17] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
  2017-09-05 17:08 ` [PATCH 00/17] x86: emulator enhancements George Dunlap
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:08 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 19787 bytes --]

While this brings quite a reduction of (source) code, the main purpose
is to no longer have exceptions raised from anywhere other than the
stubs.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1261,28 +1261,25 @@ static inline bool fpu_check_write(void)
     return !(fsw & FSW_ES);
 }
 
-#define emulate_fpu_insn(_op)                           \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op "     \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes) : : "memory" )
-
-#define emulate_fpu_insn_memdst(_op, _arg)              \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op " %1  \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes), "=m" (_arg)            \
-        : : "memory" )
-
-#define emulate_fpu_insn_memsrc(_op, _arg)              \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op " %1  \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes)                         \
-        : "m" (_arg) : "memory" )
+#define emulate_fpu_insn_memdst(opc, ext, arg)                          \
+do {                                                                    \
+    /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
+    fic.insn_bytes = 2;                                                 \
+    memcpy(get_stub(stub),                                              \
+           ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
+    invoke_stub("", "", "+m" (fic), "+m" (arg) : "a" (&(arg)));         \
+    put_stub(stub);                                                     \
+} while (0)
+
+#define emulate_fpu_insn_memsrc(opc, ext, arg)                          \
+do {                                                                    \
+    /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
+    fic.insn_bytes = 2;                                                 \
+    memcpy(get_stub(stub),                                              \
+           ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
+    invoke_stub("", "", "+m" (fic) : "m" (arg), "a" (&(arg)));          \
+    put_stub(stub);                                                     \
+} while (0)
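
To make the encoding concrete, here are two sample stubs as the new
macros would generate them (illustrative, following the ModRM comment
above):

    /*
     * fadds m32 is opcode 0xd8 /0, so the stub bytes are
     *     0xd8 0x00 0xc3    i.e.    fadds (%rax); ret
     * fstps m32 is opcode 0xd9 /3, i.e. ModRM = (3 & 7) << 3 = 0x18:
     *     0xd9 0x18 0xc3    i.e.    fstps (%rax); ret
     * %rax is pointed at the operand by the "a" (&(arg)) constraint.
     */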
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
@@ -3834,8 +3831,7 @@ x86_emulate(
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_wait, &fic);
-        fic.insn_bytes = 1;
-        asm volatile ( "fwait" ::: "memory" );
+        emulate_fpu_insn_stub(b);
         check_fpu_exn(&fic);
         break;
 
@@ -4253,37 +4249,13 @@ x86_emulate(
             emulate_fpu_insn_stub(0xd8, modrm);
             break;
         default:
+        fpu_memsrc32:
             ASSERT(ea.type == OP_MEM);
             if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                  4, ctxt)) != X86EMUL_OKAY )
                 goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fadd */
-                emulate_fpu_insn_memsrc("fadds", src.val);
-                break;
-            case 1: /* fmul */
-                emulate_fpu_insn_memsrc("fmuls", src.val);
-                break;
-            case 2: /* fcom */
-                emulate_fpu_insn_memsrc("fcoms", src.val);
-                break;
-            case 3: /* fcomp */
-                emulate_fpu_insn_memsrc("fcomps", src.val);
-                break;
-            case 4: /* fsub */
-                emulate_fpu_insn_memsrc("fsubs", src.val);
-                break;
-            case 5: /* fsubr */
-                emulate_fpu_insn_memsrc("fsubrs", src.val);
-                break;
-            case 6: /* fdiv */
-                emulate_fpu_insn_memsrc("fdivs", src.val);
-                break;
-            case 7: /* fdivr */
-                emulate_fpu_insn_memsrc("fdivrs", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
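
The eight-way switch can collapse because, for the 0xd8...0xdf opcode
rows, the operation is fully determined by the (opcode, ModRM.reg)
pair - for 0xd8:

    /* reg=0 fadd,  reg=1 fmul,  reg=2 fcom,  reg=3 fcomp,
     * reg=4 fsub,  reg=5 fsubr, reg=6 fdiv,  reg=7 fdivr -
     * re-encoding opcode and reg into the stub reproduces the
     * instruction without naming its mnemonic. */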
@@ -4330,52 +4302,46 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fld m32fp */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     4, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("flds", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc32;
             case 2: /* fst m32fp */
-                emulate_fpu_insn_memdst("fsts", dst.val);
-                dst.bytes = 4;
-                break;
             case 3: /* fstp m32fp */
-                emulate_fpu_insn_memdst("fstps", dst.val);
+            fpu_memdst32:
+                dst = ea;
                 dst.bytes = 4;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             case 4: /* fldenv - TODO */
                 state->fpu_ctrl = true;
                 goto cannot_emulate;
             case 5: /* fldcw m2byte */
                 state->fpu_ctrl = true;
+            fpu_memsrc16:
                 if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                      2, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                emulate_fpu_insn_memsrc("fldcw", src.val);
-                dst.type = OP_NONE;
+                emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
                 break;
             case 6: /* fnstenv - TODO */
                 state->fpu_ctrl = true;
                 goto cannot_emulate;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
-                emulate_fpu_insn_memdst("fnstcw", dst.val);
+            fpu_memdst16:
+                dst = ea;
                 dst.bytes = 2;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             default:
                 generate_exception(EXC_UD);
             }
             /*
              * Control instructions can't raise FPU exceptions, so we need
-             * to consider suppressing writes only for non-control ones. All
-             * of them in this group have data width 4.
+             * to consider suppressing writes only for non-control ones.
              */
-            if ( dst.type == OP_MEM && dst.bytes == 4 && !fpu_check_write() )
+            if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
@@ -4398,36 +4364,7 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                 4, ctxt)) != X86EMUL_OKAY )
-                goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fiadd m32i */
-                emulate_fpu_insn_memsrc("fiaddl", src.val);
-                break;
-            case 1: /* fimul m32i */
-                emulate_fpu_insn_memsrc("fimull", src.val);
-                break;
-            case 2: /* ficom m32i */
-                emulate_fpu_insn_memsrc("ficoml", src.val);
-                break;
-            case 3: /* ficomp m32i */
-                emulate_fpu_insn_memsrc("ficompl", src.val);
-                break;
-            case 4: /* fisub m32i */
-                emulate_fpu_insn_memsrc("fisubl", src.val);
-                break;
-            case 5: /* fisubr m32i */
-                emulate_fpu_insn_memsrc("fisubrl", src.val);
-                break;
-            case 6: /* fidiv m32i */
-                emulate_fpu_insn_memsrc("fidivl", src.val);
-                break;
-            case 7: /* fidivr m32i */
-                emulate_fpu_insn_memsrc("fidivrl", src.val);
-                break;
-            }
+            goto fpu_memsrc32;
         }
         check_fpu_exn(&fic);
         break;
@@ -4457,50 +4394,35 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fild m32i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     4, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fildl", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc32;
             case 1: /* fisttp m32i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttpl", dst.val);
-                dst.bytes = 4;
-                break;
+                /* fall through */
             case 2: /* fist m32i */
-                emulate_fpu_insn_memdst("fistl", dst.val);
-                dst.bytes = 4;
-                break;
             case 3: /* fistp m32i */
-                emulate_fpu_insn_memdst("fistpl", dst.val);
-                dst.bytes = 4;
-                break;
+                goto fpu_memdst32;
             case 5: /* fld m80fp */
+            fpu_memsrc80:
                 if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
                                      10, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                emulate_fpu_insn_memsrc("fldt", *mmvalp);
-                dst.type = OP_NONE;
+                emulate_fpu_insn_memsrc(b, modrm_reg, *mmvalp);
                 break;
             case 7: /* fstp m80fp */
+            fpu_memdst80:
                 fail_if(!ops->write);
-                emulate_fpu_insn_memdst("fstpt", *mmvalp);
+                emulate_fpu_insn_memdst(b, modrm_reg, *mmvalp);
                 if ( fpu_check_write() &&
                      (rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                                       10, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                dst.type = OP_NONE;
                 break;
             default:
                 generate_exception(EXC_UD);
             }
-            if ( dst.type == OP_MEM && !fpu_check_write() )
-                dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
         break;
@@ -4521,37 +4443,13 @@ x86_emulate(
             emulate_fpu_insn_stub(0xdc, modrm);
             break;
         default:
+        fpu_memsrc64:
             ASSERT(ea.type == OP_MEM);
             if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                  8, ctxt)) != X86EMUL_OKAY )
                 goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fadd m64fp */
-                emulate_fpu_insn_memsrc("faddl", src.val);
-                break;
-            case 1: /* fmul m64fp */
-                emulate_fpu_insn_memsrc("fmull", src.val);
-                break;
-            case 2: /* fcom m64fp */
-                emulate_fpu_insn_memsrc("fcoml", src.val);
-                break;
-            case 3: /* fcomp m64fp */
-                emulate_fpu_insn_memsrc("fcompl", src.val);
-                break;
-            case 4: /* fsub m64fp */
-                emulate_fpu_insn_memsrc("fsubl", src.val);
-                break;
-            case 5: /* fsubr m64fp */
-                emulate_fpu_insn_memsrc("fsubrl", src.val);
-                break;
-            case 6: /* fdiv m64fp */
-                emulate_fpu_insn_memsrc("fdivl", src.val);
-                break;
-            case 7: /* fdivr m64fp */
-                emulate_fpu_insn_memsrc("fdivrl", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4571,28 +4469,19 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fld m64fp */;
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     8, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fldl", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc64;
             case 1: /* fisttp m64i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttpll", dst.val);
-                dst.bytes = 8;
-                break;
+                /* fall through */
             case 2: /* fst m64fp */
-                emulate_fpu_insn_memdst("fstl", dst.val);
-                dst.bytes = 8;
-                break;
             case 3: /* fstp m64fp */
-                emulate_fpu_insn_memdst("fstpl", dst.val);
+            fpu_memdst64:
+                dst = ea;
                 dst.bytes = 8;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             case 4: /* frstor - TODO */
             case 6: /* fnsave - TODO */
@@ -4600,18 +4489,15 @@ x86_emulate(
                 goto cannot_emulate;
             case 7: /* fnstsw m2byte */
                 state->fpu_ctrl = true;
-                emulate_fpu_insn_memdst("fnstsw", dst.val);
-                dst.bytes = 2;
-                break;
+                goto fpu_memdst16;
             default:
                 generate_exception(EXC_UD);
             }
             /*
              * Control instructions can't raise FPU exceptions, so we need
-             * to consider suppressing writes only for non-control ones. All
-             * of them in this group have data width 8.
+             * to consider suppressing writes only for non-control ones.
              */
-            if ( dst.type == OP_MEM && dst.bytes == 8 && !fpu_check_write() )
+            if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
@@ -4634,33 +4520,8 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fiadd m16i */
-                emulate_fpu_insn_memsrc("fiadds", src.val);
-                break;
-            case 1: /* fimul m16i */
-                emulate_fpu_insn_memsrc("fimuls", src.val);
-                break;
-            case 2: /* ficom m16i */
-                emulate_fpu_insn_memsrc("ficoms", src.val);
-                break;
-            case 3: /* ficomp m16i */
-                emulate_fpu_insn_memsrc("ficomps", src.val);
-                break;
-            case 4: /* fisub m16i */
-                emulate_fpu_insn_memsrc("fisubs", src.val);
-                break;
-            case 5: /* fisubr m16i */
-                emulate_fpu_insn_memsrc("fisubrs", src.val);
-                break;
-            case 6: /* fidiv m16i */
-                emulate_fpu_insn_memsrc("fidivs", src.val);
-                break;
-            case 7: /* fidivr m16i */
-                emulate_fpu_insn_memsrc("fidivrs", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4676,7 +4537,7 @@ x86_emulate(
             dst.bytes = 2;
             dst.type = OP_REG;
             dst.reg = (void *)&_regs.ax;
-            emulate_fpu_insn_memdst("fnstsw", dst.val);
+            emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
             break;
         case 0xe8 ... 0xef: /* fucomip %stN */
         case 0xf0 ... 0xf7: /* fcomip %stN */
@@ -4691,59 +4552,26 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fild m16i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     2, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("filds", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc16;
             case 1: /* fisttp m16i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttps", dst.val);
-                dst.bytes = 2;
-                break;
+                /* fall through */
             case 2: /* fist m16i */
-                emulate_fpu_insn_memdst("fists", dst.val);
-                dst.bytes = 2;
-                break;
             case 3: /* fistp m16i */
-                emulate_fpu_insn_memdst("fistps", dst.val);
-                dst.bytes = 2;
-                break;
+                goto fpu_memdst16;
             case 4: /* fbld m80dec */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
-                                     10, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fbld", *mmvalp);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc80;
             case 5: /* fild m64i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     8, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fildll", src.val);
                 dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc64;
             case 6: /* fbstp packed bcd */
-                fail_if(!ops->write);
-                emulate_fpu_insn_memdst("fbstp", *mmvalp);
-                if ( fpu_check_write() &&
-                     (rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
-                                      10, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memdst80;
             case 7: /* fistp m64i */
-                emulate_fpu_insn_memdst("fistpll", dst.val);
-                dst.bytes = 8;
-                break;
+                goto fpu_memdst64;
             }
-            if ( dst.type == OP_MEM && !fpu_check_write() )
-                dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
         break;
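
The resulting write-back rule, stated once (an illustrative paraphrase
of the pattern repeated above):

    /* A memory destination of a *data* FPU instruction is dropped when
     * the instruction raised an unmasked exception:
     *     if ( dst.type == OP_MEM && !state->fpu_ctrl &&
     *          !fpu_check_write() )
     *         dst.type = OP_NONE;
     * Control instructions (fnstcw, fnstsw, ...) can't raise FPU
     * exceptions, so their memory writes always go through. */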



[-- Attachment #2: x86emul-FPU-use-stub.patch --]
[-- Type: text/plain, Size: 19831 bytes --]

x86emul: make all FPU emulation use the stub

While this brings quite a reduction of (source) code, the main purpose
is to no longer have exceptions raised from anywhere other than the
stubs.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1261,28 +1261,25 @@ static inline bool fpu_check_write(void)
     return !(fsw & FSW_ES);
 }
 
-#define emulate_fpu_insn(_op)                           \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op "     \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes) : : "memory" )
-
-#define emulate_fpu_insn_memdst(_op, _arg)              \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op " %1  \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes), "=m" (_arg)            \
-        : : "memory" )
-
-#define emulate_fpu_insn_memsrc(_op, _arg)              \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op " %1  \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes)                         \
-        : "m" (_arg) : "memory" )
+#define emulate_fpu_insn_memdst(opc, ext, arg)                          \
+do {                                                                    \
+    /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
+    fic.insn_bytes = 2;                                                 \
+    memcpy(get_stub(stub),                                              \
+           ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
+    invoke_stub("", "", "+m" (fic), "+m" (arg) : "a" (&(arg)));         \
+    put_stub(stub);                                                     \
+} while (0)
+
+#define emulate_fpu_insn_memsrc(opc, ext, arg)                          \
+do {                                                                    \
+    /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
+    fic.insn_bytes = 2;                                                 \
+    memcpy(get_stub(stub),                                              \
+           ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
+    invoke_stub("", "", "+m" (fic) : "m" (arg), "a" (&(arg)));          \
+    put_stub(stub);                                                     \
+} while (0)
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
@@ -3834,8 +3831,7 @@ x86_emulate(
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_wait, &fic);
-        fic.insn_bytes = 1;
-        asm volatile ( "fwait" ::: "memory" );
+        emulate_fpu_insn_stub(b);
         check_fpu_exn(&fic);
         break;
 
@@ -4253,37 +4249,13 @@ x86_emulate(
             emulate_fpu_insn_stub(0xd8, modrm);
             break;
         default:
+        fpu_memsrc32:
             ASSERT(ea.type == OP_MEM);
             if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                  4, ctxt)) != X86EMUL_OKAY )
                 goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fadd */
-                emulate_fpu_insn_memsrc("fadds", src.val);
-                break;
-            case 1: /* fmul */
-                emulate_fpu_insn_memsrc("fmuls", src.val);
-                break;
-            case 2: /* fcom */
-                emulate_fpu_insn_memsrc("fcoms", src.val);
-                break;
-            case 3: /* fcomp */
-                emulate_fpu_insn_memsrc("fcomps", src.val);
-                break;
-            case 4: /* fsub */
-                emulate_fpu_insn_memsrc("fsubs", src.val);
-                break;
-            case 5: /* fsubr */
-                emulate_fpu_insn_memsrc("fsubrs", src.val);
-                break;
-            case 6: /* fdiv */
-                emulate_fpu_insn_memsrc("fdivs", src.val);
-                break;
-            case 7: /* fdivr */
-                emulate_fpu_insn_memsrc("fdivrs", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4330,52 +4302,46 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fld m32fp */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     4, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("flds", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc32;
             case 2: /* fst m32fp */
-                emulate_fpu_insn_memdst("fsts", dst.val);
-                dst.bytes = 4;
-                break;
             case 3: /* fstp m32fp */
-                emulate_fpu_insn_memdst("fstps", dst.val);
+            fpu_memdst32:
+                dst = ea;
                 dst.bytes = 4;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             case 4: /* fldenv - TODO */
                 state->fpu_ctrl = true;
                 goto cannot_emulate;
             case 5: /* fldcw m2byte */
                 state->fpu_ctrl = true;
+            fpu_memsrc16:
                 if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                      2, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                emulate_fpu_insn_memsrc("fldcw", src.val);
-                dst.type = OP_NONE;
+                emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
                 break;
             case 6: /* fnstenv - TODO */
                 state->fpu_ctrl = true;
                 goto cannot_emulate;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
-                emulate_fpu_insn_memdst("fnstcw", dst.val);
+            fpu_memdst16:
+                dst = ea;
                 dst.bytes = 2;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             default:
                 generate_exception(EXC_UD);
             }
             /*
              * Control instructions can't raise FPU exceptions, so we need
-             * to consider suppressing writes only for non-control ones. All
-             * of them in this group have data width 4.
+             * to consider suppressing writes only for non-control ones.
              */
-            if ( dst.type == OP_MEM && dst.bytes == 4 && !fpu_check_write() )
+            if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
@@ -4398,36 +4364,7 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                 4, ctxt)) != X86EMUL_OKAY )
-                goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fiadd m32i */
-                emulate_fpu_insn_memsrc("fiaddl", src.val);
-                break;
-            case 1: /* fimul m32i */
-                emulate_fpu_insn_memsrc("fimull", src.val);
-                break;
-            case 2: /* ficom m32i */
-                emulate_fpu_insn_memsrc("ficoml", src.val);
-                break;
-            case 3: /* ficomp m32i */
-                emulate_fpu_insn_memsrc("ficompl", src.val);
-                break;
-            case 4: /* fisub m32i */
-                emulate_fpu_insn_memsrc("fisubl", src.val);
-                break;
-            case 5: /* fisubr m32i */
-                emulate_fpu_insn_memsrc("fisubrl", src.val);
-                break;
-            case 6: /* fidiv m32i */
-                emulate_fpu_insn_memsrc("fidivl", src.val);
-                break;
-            case 7: /* fidivr m32i */
-                emulate_fpu_insn_memsrc("fidivrl", src.val);
-                break;
-            }
+            goto fpu_memsrc32;
         }
         check_fpu_exn(&fic);
         break;
@@ -4457,50 +4394,35 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fild m32i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     4, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fildl", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc32;
             case 1: /* fisttp m32i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttpl", dst.val);
-                dst.bytes = 4;
-                break;
+                /* fall through */
             case 2: /* fist m32i */
-                emulate_fpu_insn_memdst("fistl", dst.val);
-                dst.bytes = 4;
-                break;
             case 3: /* fistp m32i */
-                emulate_fpu_insn_memdst("fistpl", dst.val);
-                dst.bytes = 4;
-                break;
+                goto fpu_memdst32;
             case 5: /* fld m80fp */
+            fpu_memsrc80:
                 if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
                                      10, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                emulate_fpu_insn_memsrc("fldt", *mmvalp);
-                dst.type = OP_NONE;
+                emulate_fpu_insn_memsrc(b, modrm_reg, *mmvalp);
                 break;
             case 7: /* fstp m80fp */
+            fpu_memdst80:
                 fail_if(!ops->write);
-                emulate_fpu_insn_memdst("fstpt", *mmvalp);
+                emulate_fpu_insn_memdst(b, modrm_reg, *mmvalp);
                 if ( fpu_check_write() &&
                      (rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                                       10, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                dst.type = OP_NONE;
                 break;
             default:
                 generate_exception(EXC_UD);
             }
-            if ( dst.type == OP_MEM && !fpu_check_write() )
-                dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
         break;
@@ -4521,37 +4443,13 @@ x86_emulate(
             emulate_fpu_insn_stub(0xdc, modrm);
             break;
         default:
+        fpu_memsrc64:
             ASSERT(ea.type == OP_MEM);
             if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                  8, ctxt)) != X86EMUL_OKAY )
                 goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fadd m64fp */
-                emulate_fpu_insn_memsrc("faddl", src.val);
-                break;
-            case 1: /* fmul m64fp */
-                emulate_fpu_insn_memsrc("fmull", src.val);
-                break;
-            case 2: /* fcom m64fp */
-                emulate_fpu_insn_memsrc("fcoml", src.val);
-                break;
-            case 3: /* fcomp m64fp */
-                emulate_fpu_insn_memsrc("fcompl", src.val);
-                break;
-            case 4: /* fsub m64fp */
-                emulate_fpu_insn_memsrc("fsubl", src.val);
-                break;
-            case 5: /* fsubr m64fp */
-                emulate_fpu_insn_memsrc("fsubrl", src.val);
-                break;
-            case 6: /* fdiv m64fp */
-                emulate_fpu_insn_memsrc("fdivl", src.val);
-                break;
-            case 7: /* fdivr m64fp */
-                emulate_fpu_insn_memsrc("fdivrl", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4571,28 +4469,19 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fld m64fp */;
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     8, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fldl", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc64;
             case 1: /* fisttp m64i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttpll", dst.val);
-                dst.bytes = 8;
-                break;
+                /* fall through */
             case 2: /* fst m64fp */
-                emulate_fpu_insn_memdst("fstl", dst.val);
-                dst.bytes = 8;
-                break;
             case 3: /* fstp m64fp */
-                emulate_fpu_insn_memdst("fstpl", dst.val);
+            fpu_memdst64:
+                dst = ea;
                 dst.bytes = 8;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             case 4: /* frstor - TODO */
             case 6: /* fnsave - TODO */
@@ -4600,18 +4489,15 @@ x86_emulate(
                 goto cannot_emulate;
             case 7: /* fnstsw m2byte */
                 state->fpu_ctrl = true;
-                emulate_fpu_insn_memdst("fnstsw", dst.val);
-                dst.bytes = 2;
-                break;
+                goto fpu_memdst16;
             default:
                 generate_exception(EXC_UD);
             }
             /*
              * Control instructions can't raise FPU exceptions, so we need
-             * to consider suppressing writes only for non-control ones. All
-             * of them in this group have data width 8.
+             * to consider suppressing writes only for non-control ones.
              */
-            if ( dst.type == OP_MEM && dst.bytes == 8 && !fpu_check_write() )
+            if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
@@ -4634,33 +4520,8 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fiadd m16i */
-                emulate_fpu_insn_memsrc("fiadds", src.val);
-                break;
-            case 1: /* fimul m16i */
-                emulate_fpu_insn_memsrc("fimuls", src.val);
-                break;
-            case 2: /* ficom m16i */
-                emulate_fpu_insn_memsrc("ficoms", src.val);
-                break;
-            case 3: /* ficomp m16i */
-                emulate_fpu_insn_memsrc("ficomps", src.val);
-                break;
-            case 4: /* fisub m16i */
-                emulate_fpu_insn_memsrc("fisubs", src.val);
-                break;
-            case 5: /* fisubr m16i */
-                emulate_fpu_insn_memsrc("fisubrs", src.val);
-                break;
-            case 6: /* fidiv m16i */
-                emulate_fpu_insn_memsrc("fidivs", src.val);
-                break;
-            case 7: /* fidivr m16i */
-                emulate_fpu_insn_memsrc("fidivrs", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4676,7 +4537,7 @@ x86_emulate(
             dst.bytes = 2;
             dst.type = OP_REG;
             dst.reg = (void *)&_regs.ax;
-            emulate_fpu_insn_memdst("fnstsw", dst.val);
+            emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
             break;
         case 0xe8 ... 0xef: /* fucomip %stN */
         case 0xf0 ... 0xf7: /* fcomip %stN */
@@ -4691,59 +4552,26 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fild m16i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     2, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("filds", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc16;
             case 1: /* fisttp m16i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttps", dst.val);
-                dst.bytes = 2;
-                break;
+                /* fall through */
             case 2: /* fist m16i */
-                emulate_fpu_insn_memdst("fists", dst.val);
-                dst.bytes = 2;
-                break;
             case 3: /* fistp m16i */
-                emulate_fpu_insn_memdst("fistps", dst.val);
-                dst.bytes = 2;
-                break;
+                goto fpu_memdst16;
             case 4: /* fbld m80dec */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
-                                     10, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fbld", *mmvalp);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc80;
             case 5: /* fild m64i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     8, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fildll", src.val);
                 dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc64;
             case 6: /* fbstp packed bcd */
-                fail_if(!ops->write);
-                emulate_fpu_insn_memdst("fbstp", *mmvalp);
-                if ( fpu_check_write() &&
-                     (rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
-                                      10, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memdst80;
             case 7: /* fistp m64i */
-                emulate_fpu_insn_memdst("fistpll", dst.val);
-                dst.bytes = 8;
-                break;
+                goto fpu_memdst64;
             }
-            if ( dst.type == OP_MEM && !fpu_check_write() )
-                dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
         break;


* [PATCH 17/17] x86/HVM: eliminate custom #MF/#XM handling
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (15 preceding siblings ...)
  2017-06-21 12:08 ` [PATCH 16/17] x86emul: make all FPU emulation use the stub Jan Beulich
@ 2017-06-21 12:09 ` Jan Beulich
  2017-09-05 17:08 ` [PATCH 00/17] x86: emulator enhancements George Dunlap
  17 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-06-21 12:09 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 38970 bytes --]

Use the generic stub exception handling instead.
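
For reference, a minimal sketch of the token convention the reworked
invoke_stub() relies on (field order and widths are assumed from usage;
only .fields.trapnr, .fields.ec and .raw appear in the patch):

    #include <stdint.h>

    union stub_exception_token {
        struct {
            uint16_t ec;       /* error code, if the vector pushes one */
            uint8_t  trapnr;   /* exception vector (EXC_MF, EXC_XM, ...) */
        } fields;
        unsigned long raw;     /* initialized to ~0 = "no exception" */
    };

    /* Mirrors the unlikely(~res_.raw) check: the fixup handler wrote
     * something other than all-ones, i.e. the stub faulted. */
    static inline int stub_raised(union stub_exception_token t)
    {
        return ~t.raw != 0;
    }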

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -134,8 +134,6 @@ int emul_test_read_xcr(
 }
 
 int emul_test_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt)
 {
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -221,8 +221,6 @@ int emul_test_read_xcr(
     struct x86_emulate_ctxt *ctxt);
 
 int emul_test_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt);
 
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1727,8 +1727,6 @@ int hvmemul_cpuid(uint32_t leaf, uint32_
 }
 
 static int hvmemul_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -1766,9 +1764,6 @@ static int hvmemul_get_fpu(
         }
     }
 
-    curr->arch.hvm_vcpu.fpu_exception_callback = exception_callback;
-    curr->arch.hvm_vcpu.fpu_exception_callback_arg = exception_callback_arg;
-
     return X86EMUL_OKAY;
 }
 
@@ -1779,8 +1774,6 @@ static void hvmemul_put_fpu(
 {
     struct vcpu *curr = current;
 
-    curr->arch.hvm_vcpu.fpu_exception_callback = NULL;
-
     if ( aux )
     {
         typeof(curr->arch.xsave_area->fpu_sse) *fpu_ctxt = curr->arch.fpu_ctxt;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -780,7 +780,6 @@ void do_reserved_trap(struct cpu_user_re
 
 void do_trap(struct cpu_user_regs *regs)
 {
-    struct vcpu *curr = current;
     unsigned int trapnr = regs->entry_vector;
     unsigned long fixup;
 
@@ -800,15 +799,6 @@ void do_trap(struct cpu_user_regs *regs)
         return;
     }
 
-    if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
-         system_state >= SYS_STATE_active && is_hvm_vcpu(curr) &&
-         curr->arch.hvm_vcpu.fpu_exception_callback )
-    {
-        curr->arch.hvm_vcpu.fpu_exception_callback(
-            curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
-        return;
-    }
-
     if ( likely((fixup = search_exception_table(regs)) != 0) )
     {
         dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n",
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -959,6 +959,33 @@ static inline int mkec(uint8_t e, int32_
 #define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
 
 #ifdef __XEN__
+static int exception_from_stub(union stub_exception_token res,
+                               void *stub, unsigned int line,
+                               struct x86_emulate_ctxt *ctxt,
+                               const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_UNHANDLEABLE;
+
+    generate_exception_if(res.fields.trapnr == EXC_MF, EXC_MF);
+    if ( res.fields.trapnr == EXC_XM )
+    {
+        unsigned long cr4;
+
+        if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+            cr4 = X86_CR4_OSXMMEXCPT;
+        generate_exception(cr4 & X86_CR4_OSXMMEXCPT ? EXC_XM : EXC_UD);
+    }
+    gprintk(XENLOG_WARNING,
+            "exception %u (ec=%04x) in emulation stub (line %u)\n",
+            res.fields.trapnr, res.fields.ec, line);
+    gprintk(XENLOG_INFO, "stub: %"__stringify(MAX_INST_LEN)"ph\n", stub);
+    generate_exception_if(res.fields.trapnr == EXC_UD, EXC_UD);
+    domain_crash(current->domain);
+
+ done:
+    return rc;
+}
+
 # define invoke_stub(pre, post, constraints...) do {                    \
     union stub_exception_token res_ = { .raw = ~0 };                    \
     asm volatile ( pre "\n\tcall *%[stub]\n\t" post "\n"                \
@@ -974,14 +1001,8 @@ static inline int mkec(uint8_t e, int32_
                      "m" (*(uint8_t(*)[MAX_INST_LEN + 1])stub.ptr) );   \
     if ( unlikely(~res_.raw) )                                          \
     {                                                                   \
-        gprintk(XENLOG_WARNING,                                         \
-                "exception %u (ec=%04x) in emulation stub (line %u)\n", \
-                res_.fields.trapnr, res_.fields.ec, __LINE__);          \
-        gprintk(XENLOG_INFO, "stub: %"__stringify(MAX_INST_LEN)"ph\n",  \
-                stub.func);                                             \
-        generate_exception_if(res_.fields.trapnr == EXC_UD, EXC_UD);    \
-        domain_crash(current->domain);                                  \
-        goto cannot_emulate;                                            \
+        rc = exception_from_stub(res_, stub.func, __LINE__, ctxt, ops); \
+        goto done;                                                      \
     }                                                                   \
 } while (0)
 #else
@@ -1097,23 +1118,8 @@ do {
     ops->write_segment(x86_seg_cs, cs, ctxt);                           \
 })
 
-struct fpu_insn_ctxt {
-    uint8_t insn_bytes;
-    uint8_t type;
-    int8_t exn_raised;
-};
-
-static void fpu_handle_exception(void *_fic, struct cpu_user_regs *regs)
-{
-    struct fpu_insn_ctxt *fic = _fic;
-    ASSERT(regs->entry_vector < 0x20);
-    fic->exn_raised = regs->entry_vector;
-    regs->r(ip) += fic->insn_bytes;
-}
-
 static int _get_fpu(
     enum x86_emulate_fpu_type type,
-    struct fpu_insn_ctxt *fic,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
@@ -1138,14 +1144,13 @@ static int _get_fpu(
         break;
     }
 
-    rc = ops->get_fpu(fpu_handle_exception, fic, type, ctxt);
+    rc = ops->get_fpu(type, ctxt);
 
     if ( rc == X86EMUL_OKAY )
     {
         unsigned long cr0;
 
         fail_if(type == X86EMUL_FPU_fpu && !ops->put_fpu);
-        fic->type = type;
 
         fail_if(!ops->read_cr);
         if ( type >= X86EMUL_FPU_xmm )
@@ -1183,37 +1188,22 @@ static int _get_fpu(
     return rc;
 }
 
-#define get_fpu(_type, _fic)                                    \
+#define get_fpu(type)                                           \
 do {                                                            \
-    rc = _get_fpu(_type, _fic, ctxt, ops);                      \
+    rc = _get_fpu(fpu_type = (type), ctxt, ops);                \
     if ( rc ) goto done;                                        \
 } while (0)
 
-#define check_fpu_exn(fic)                                      \
-do {                                                            \
-    generate_exception_if((fic)->exn_raised >= 0,               \
-                          (fic)->exn_raised);                   \
-} while (0)
-
-#define check_xmm_exn(fic)                                      \
-do {                                                            \
-    if ( (fic)->exn_raised == EXC_XM && ops->read_cr &&         \
-         ops->read_cr(4, &cr4, ctxt) == X86EMUL_OKAY &&         \
-         !(cr4 & X86_CR4_OSXMMEXCPT) )                          \
-        (fic)->exn_raised = EXC_UD;                             \
-    check_fpu_exn(fic);                                         \
-} while (0)
-
 static void put_fpu(
-    struct fpu_insn_ctxt *fic,
+    enum x86_emulate_fpu_type type,
     bool failed_late,
     const struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
-    if ( unlikely(failed_late) && fic->type == X86EMUL_FPU_fpu )
+    if ( unlikely(failed_late) && type == X86EMUL_FPU_fpu )
         ops->put_fpu(ctxt, X86EMUL_FPU_fpu, NULL);
-    else if ( unlikely(fic->type == X86EMUL_FPU_fpu) && !state->fpu_ctrl )
+    else if ( unlikely(type == X86EMUL_FPU_fpu) && !state->fpu_ctrl )
     {
         struct x86_emul_fpu_aux aux = {
             .ip = ctxt->regs->r(ip),
@@ -1247,9 +1237,8 @@ static void put_fpu(
         }
         ops->put_fpu(ctxt, X86EMUL_FPU_none, &aux);
     }
-    else if ( fic->type != X86EMUL_FPU_none && ops->put_fpu )
+    else if ( type != X86EMUL_FPU_none && ops->put_fpu )
         ops->put_fpu(ctxt, X86EMUL_FPU_none, NULL);
-    fic->type = X86EMUL_FPU_none;
 }
 
 static inline bool fpu_check_write(void)
@@ -1264,29 +1253,27 @@ static inline bool fpu_check_write(void)
 #define emulate_fpu_insn_memdst(opc, ext, arg)                          \
 do {                                                                    \
     /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
-    fic.insn_bytes = 2;                                                 \
+    insn_bytes = 2;                                                     \
     memcpy(get_stub(stub),                                              \
            ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
-    invoke_stub("", "", "+m" (fic), "+m" (arg) : "a" (&(arg)));         \
+    invoke_stub("", "", "+m" (arg) : "a" (&(arg)));                     \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_memsrc(opc, ext, arg)                          \
 do {                                                                    \
     /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
-    fic.insn_bytes = 2;                                                 \
     memcpy(get_stub(stub),                                              \
            ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
-    invoke_stub("", "", "+m" (fic) : "m" (arg), "a" (&(arg)));          \
+    invoke_stub("", "", "=m" (dummy) : "m" (arg), "a" (&(arg)));        \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
-    fic.insn_bytes = nr_;                                               \
     memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
-    invoke_stub("", "", "=m" (fic) : "m" (fic));                        \
+    invoke_stub("", "", "=m" (dummy) : "i" (0));                        \
     put_stub(stub);                                                     \
 } while (0)
 
@@ -1294,12 +1281,10 @@ do {
 do {                                                                    \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
     unsigned long tmp_;                                                 \
-    fic.insn_bytes = nr_;                                               \
     memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
     invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),             \
                 _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),            \
-                [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_),       \
-                "+m" (fic)                                              \
+                [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_)        \
                 : [mask] "i" (X86_EFLAGS_ZF|X86_EFLAGS_PF|X86_EFLAGS_CF)); \
     put_stub(stub);                                                     \
 } while (0)
@@ -3134,14 +3119,14 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0;
+    unsigned int first_byte = 0, insn_bytes = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4;
-    struct fpu_insn_ctxt fic = { .type = X86EMUL_FPU_none, .exn_raised = -1 };
+    enum x86_emulate_fpu_type fpu_type = X86EMUL_FPU_none;
     struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
 
@@ -3830,9 +3815,8 @@ x86_emulate(
 
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_wait, &fic);
+        get_fpu(X86EMUL_FPU_wait);
         emulate_fpu_insn_stub(b);
-        check_fpu_exn(&fic);
         break;
 
     case 0x9c: /* pushf */
@@ -4235,7 +4219,7 @@ x86_emulate(
 
     case 0xd8: /* FPU 0xd8 */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fadd %stN,%st */
@@ -4257,12 +4241,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xd9: /* FPU 0xd9 */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xfb: /* fsincos */
@@ -4344,12 +4327,11 @@ x86_emulate(
             if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xda: /* FPU 0xda */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fcmovb %stN */
@@ -4366,12 +4348,11 @@ x86_emulate(
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
             goto fpu_memsrc32;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdb: /* FPU 0xdb */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fcmovnb %stN */
@@ -4424,12 +4405,11 @@ x86_emulate(
                 generate_exception(EXC_UD);
             }
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdc: /* FPU 0xdc */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fadd %st,%stN */
@@ -4451,12 +4431,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdd: /* FPU 0xdd */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* ffree %stN */
@@ -4500,12 +4479,11 @@ x86_emulate(
             if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xde: /* FPU 0xde */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* faddp %stN */
@@ -4523,12 +4501,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdf: /* FPU 0xdf */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xe0:
@@ -4573,7 +4550,6 @@ x86_emulate(
                 goto fpu_memdst64;
             }
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
@@ -5393,7 +5369,7 @@ x86_emulate(
         else
             generate_exception(EXC_UD);
 
-        get_fpu(X86EMUL_FPU_mmx, &fic);
+        get_fpu(X86EMUL_FPU_mmx);
 
         d = DstReg | SrcMem;
         op_bytes = 8;
@@ -5483,7 +5459,7 @@ x86_emulate(
             else
                 vcpu_must_have(sse);
     simd_0f_xmm:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
@@ -5493,7 +5469,7 @@ x86_emulate(
     simd_0f_avx:
             host_and_vcpu_must_have(avx);
     simd_0f_ymm:
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
     simd_0f_common:
         opc = init_prefixes(stub);
@@ -5506,7 +5482,7 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         break;
 
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
@@ -5593,12 +5569,12 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         if ( ea.type == OP_MEM )
@@ -5624,7 +5600,7 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
@@ -5632,7 +5608,7 @@ x86_emulate(
                 vex.l = 0;
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -5655,17 +5631,14 @@ x86_emulate(
             opc[1] = modrm & 0xc7;
         if ( !mode_64bit() )
             vex.w = 0;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         ea.reg = decode_register(modrm_reg, &_regs, 0);
-        invoke_stub("", "", "=a" (*ea.reg), "+m" (fic.exn_raised)
-                            : "c" (mmvalp), "m" (*mmvalp));
+        invoke_stub("", "", "=a" (*ea.reg) : "c" (mmvalp), "m" (*mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         state->simd_size = simd_none;
         break;
 
@@ -5679,13 +5652,13 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -5703,20 +5676,17 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     [eflags] "+g" (_regs.eflags),
-                    [tmp] "=&r" (dummy), "+m" (*mmvalp),
-                    "+m" (fic.exn_raised)
+                    [tmp] "=&r" (dummy), "+m" (*mmvalp)
                     : "a" (mmvalp), [mask] "i" (EFLAGS_MASK));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -5854,9 +5824,9 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0xc7;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
     simd_0f_to_gpr:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
 
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
@@ -5875,9 +5845,9 @@ x86_emulate(
                     vcpu_must_have(sse);
             }
             if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
-                get_fpu(X86EMUL_FPU_xmm, &fic);
+                get_fpu(X86EMUL_FPU_xmm);
             else
-                get_fpu(X86EMUL_FPU_mmx, &fic);
+                get_fpu(X86EMUL_FPU_mmx);
         }
         else
         {
@@ -5886,14 +5856,13 @@ x86_emulate(
                 host_and_vcpu_must_have(avx);
             else
                 host_and_vcpu_must_have(avx2);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         ASSERT(!state->simd_size);
         dst.bytes = 4;
@@ -6059,7 +6028,7 @@ x86_emulate(
             goto simd_0f_sse2;
     simd_0f_mmx:
         host_and_vcpu_must_have(mmx);
-        get_fpu(X86EMUL_FPU_mmx, &fic);
+        get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
@@ -6070,17 +6039,17 @@ x86_emulate(
         {
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
     simd_0f_rm:
@@ -6092,17 +6061,14 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
-        invoke_stub("", "", "+m" (src.val), "+m" (fic.exn_raised)
-                            : "a" (&src.val));
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
         dst.val = src.val;
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -6168,19 +6134,19 @@ x86_emulate(
                 host_and_vcpu_must_have(avx);
             }
     simd_0f_imm8_ymm:
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
     simd_0f_imm8_sse2:
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
             vcpu_must_have(mmxext);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
     simd_0f_imm8:
         opc = init_prefixes(stub);
@@ -6194,7 +6160,7 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         break;
 
     CASE_SIMD_PACKED_INT(0x0f, 0x71):    /* Grp12 */
@@ -6222,33 +6188,31 @@ x86_emulate(
                 host_and_vcpu_must_have(avx2);
             else
                 host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
     simd_0f_reg_only:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", [dummy_out] "=g" (dummy) : [dummy_in] "i" (0) );
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -6283,7 +6247,7 @@ x86_emulate(
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
 
 #ifdef __x86_64__
             if ( !mode_64bit() )
@@ -6325,12 +6289,12 @@ x86_emulate(
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         opc = init_prefixes(stub);
         opc[0] = b;
-        fic.insn_bytes = PFX_BYTES + 1;
+        insn_bytes = PFX_BYTES + 1;
         goto simd_0f_reg_only;
 
     case X86EMUL_OPC_66(0x0f, 0x78):     /* Grp17 */
@@ -6346,14 +6310,14 @@ x86_emulate(
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
         host_and_vcpu_must_have(sse4a);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
 
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         opc[2] = imm1;
         opc[3] = imm2;
-        fic.insn_bytes = PFX_BYTES + 4;
+        insn_bytes = PFX_BYTES + 4;
         goto simd_0f_reg_only;
 
     case X86EMUL_OPC_66(0x0f, 0x79):     /* extrq xmm,xmm */
@@ -6481,7 +6445,7 @@ x86_emulate(
             vcpu_must_have(sse);
         ldmxcsr:
             generate_exception_if(src.type != OP_MEM, EXC_UD);
-            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm, &fic);
+            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             generate_exception_if(src.val & ~mxcsr_mask, EXC_GP, 0);
             asm volatile ( "ldmxcsr %0" :: "m" (src.val) );
             break;
@@ -6491,7 +6455,7 @@ x86_emulate(
             vcpu_must_have(sse);
         stmxcsr:
             generate_exception_if(dst.type != OP_MEM, EXC_UD);
-            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm, &fic);
+            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             asm volatile ( "stmxcsr %0" : "=m" (dst.val) );
             break;
 
@@ -6745,7 +6709,7 @@ x86_emulate(
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
                 goto simd_0f_imm8_sse2;
             vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
             goto simd_0f_imm8;
         }
         goto simd_0f_imm8_avx;
@@ -6777,7 +6741,7 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0xc7;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
@@ -7023,18 +6987,18 @@ x86_emulate(
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             d |= TwoOp;
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
             vcpu_must_have(mmxext);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         /*
@@ -7054,7 +7018,6 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0xc7;
-        fic.insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
@@ -7067,6 +7030,7 @@ x86_emulate(
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
         /* Restore high bit of XMM destination. */
         if ( sfence )
         {
@@ -7113,12 +7077,12 @@ x86_emulate(
         if ( vex.pfx )
         {
     simd_0f38_common:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
         opc = init_prefixes(stub);
         opc[0] = 0x38;
@@ -7131,7 +7095,7 @@ x86_emulate(
             vex.b = 1;
             opc[2] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         break;
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
@@ -7155,13 +7119,13 @@ x86_emulate(
         if ( vex.opcx == vex_none )
         {
             host_and_vcpu_must_have(sse4_1);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -7180,21 +7144,19 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
+            ++insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         emulate_stub("+m" (*mmvalp), "a" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         state->simd_size = simd_none;
         dst.type = OP_NONE;
         break;
@@ -7283,7 +7245,7 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
         host_and_vcpu_must_have(avx);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /*
          * While we can't reasonably provide fully correct behavior here
@@ -7332,7 +7294,7 @@ x86_emulate(
         rex_prefix &= ~REX_B;
         vex.b = 1;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
 
         break;
     }
@@ -7381,7 +7343,7 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         host_and_vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /*
          * While we can't reasonably provide fully correct behavior here
@@ -7428,7 +7390,7 @@ x86_emulate(
         rex_prefix &= ~REX_B;
         vex.b = 1;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
 
         break;
     }
@@ -7451,7 +7413,7 @@ x86_emulate(
                               state->sib_index == mask_reg, EXC_UD);
         generate_exception_if(!cpu_has_avx, EXC_UD);
         vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /* Read destination, index, and mask registers. */
         opc = init_prefixes(stub);
@@ -7788,12 +7750,12 @@ x86_emulate(
         if ( vex.pfx )
         {
     simd_0f3a_common:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
         opc = init_prefixes(stub);
         opc[0] = 0x3a;
@@ -7807,7 +7769,7 @@ x86_emulate(
             opc[2] &= 0x38;
         }
         opc[3] = imm1;
-        fic.insn_bytes = PFX_BYTES + 4;
+        insn_bytes = PFX_BYTES + 4;
         break;
 
     case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
@@ -7815,7 +7777,7 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
     case X86EMUL_OPC_66(0x0f3a, 0x17): /* extractps $imm8,xmm,r/m */
         host_and_vcpu_must_have(sse4_1);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
 
         opc = init_prefixes(stub);
         opc++[0] = 0x3a;
@@ -7828,20 +7790,16 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0x38;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
-
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         ASSERT(!state->simd_size);
         dst.bytes = dst.type == OP_REG || b == 0x17 ? 4 : 1 << (b & 3);
@@ -7855,7 +7813,7 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
         host_and_vcpu_must_have(avx);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
         opc = init_prefixes(stub);
         goto pextr;
 
@@ -7877,17 +7835,15 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
 
         copy_VEX(opc, vex);
         /* Latch MXCSR - we may need to restore it below. */
         invoke_stub("stmxcsr %[mxcsr]", "",
-                    "=m" (*mmvalp), "+m" (fic.exn_raised), [mxcsr] "=m" (mxcsr)
-                    : "a" (mmvalp));
+                    "=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         if ( ea.type == OP_MEM )
         {
@@ -7906,7 +7862,7 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
         memcpy(mmvalp, &src.val, op_bytes);
         ea.type = OP_MEM;
         op_bytes = src.bytes;
@@ -8014,13 +7970,13 @@ x86_emulate(
         if ( vex.opcx == vex_none )
         {
             host_and_vcpu_must_have(sse4_2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -8041,13 +7997,13 @@ x86_emulate(
                 goto done;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
+            ++insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
@@ -8275,7 +8231,7 @@ x86_emulate(
 
         if ( !opc )
             BUG();
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
         copy_REX_VEX(opc, rex_prefix, vex);
 
         if ( ea.type == OP_MEM )
@@ -8352,13 +8308,11 @@ x86_emulate(
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
                                       X86EMUL_OPC_ENCODING_MASK)) !=
                     X86EMUL_OPC(0x0f, 0xf7)) )
-            invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
-                                : "a" (mmvalp));
+            invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
         else
             invoke_stub("", "", "+m" (*mmvalp) : "D" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
     }
 
     switch ( dst.type )
@@ -8401,7 +8355,8 @@ x86_emulate(
     }
 
  complete_insn: /* Commit shadow register state. */
-    put_fpu(&fic, false, state, ctxt, ops);
+    put_fpu(fpu_type, false, state, ctxt, ops);
+    fpu_type = X86EMUL_FPU_none;
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -8425,7 +8380,7 @@ x86_emulate(
     ctxt->regs->eflags &= ~X86_EFLAGS_RF;
 
  done:
-    put_fpu(&fic, fic.insn_bytes > 0 && dst.type == OP_MEM, state, ctxt, ops);
+    put_fpu(fpu_type, insn_bytes > 0 && dst.type == OP_MEM, state, ctxt, ops);
     put_stub(stub);
     return rc;
 #undef state
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -428,12 +428,8 @@ struct x86_emulate_ops
 
     /*
      * get_fpu: Load emulated environment's FPU state onto processor.
-     *  @exn_callback: On any FPU or SIMD exception, pass control to
-     *                 (*exception_callback)(exception_callback_arg, regs).
      */
     int (*get_fpu)(
-        void (*exception_callback)(void *, struct cpu_user_regs *),
-        void *exception_callback_arg,
         enum x86_emulate_fpu_type type,
         struct x86_emulate_ctxt *ctxt);
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -196,10 +196,6 @@ struct hvm_vcpu {
 
     struct hvm_vcpu_io  hvm_io;
 
-    /* Callback into x86_emulate when emulating FPU/MMX/XMM instructions. */
-    void (*fpu_exception_callback)(void *, struct cpu_user_regs *);
-    void *fpu_exception_callback_arg;
-
     /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
     struct x86_event     inject_event;
 

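One behavioral subtlety in the patch above deserves a note: when the
stub takes #XM but the guest's CR4.OSXMMEXCPT is clear, real hardware
would have raised #UD instead, and exception_from_stub() mirrors that.
A standalone restatement (illustration only; the constants carry the
architectural values):

    #include <stdio.h>

    #define X86_CR4_OSXMMEXCPT (1u << 10)
    #define EXC_UD  6   /* #UD: invalid opcode */
    #define EXC_XM 19   /* #XM: SIMD floating-point exception */

    static int vec_for_simd_fault(unsigned long cr4)
    {
        return (cr4 & X86_CR4_OSXMMEXCPT) ? EXC_XM : EXC_UD;
    }

    int main(void)
    {
        printf("%d %d\n", vec_for_simd_fault(X86_CR4_OSXMMEXCPT),
               vec_for_simd_fault(0));
        return 0;
    }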


[-- Attachment #2: x86emul-MF-XM-handling.patch --]
[-- Type: text/plain, Size: 39012 bytes --]

x86/HVM: eliminate custom #MF/#XM handling

Use the generic stub exception handling instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -134,8 +134,6 @@ int emul_test_read_xcr(
 }
 
 int emul_test_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt)
 {
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -221,8 +221,6 @@ int emul_test_read_xcr(
     struct x86_emulate_ctxt *ctxt);
 
 int emul_test_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt);
 
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1727,8 +1727,6 @@ int hvmemul_cpuid(uint32_t leaf, uint32_
 }
 
 static int hvmemul_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -1766,9 +1764,6 @@ static int hvmemul_get_fpu(
         }
     }
 
-    curr->arch.hvm_vcpu.fpu_exception_callback = exception_callback;
-    curr->arch.hvm_vcpu.fpu_exception_callback_arg = exception_callback_arg;
-
     return X86EMUL_OKAY;
 }
 
@@ -1779,8 +1774,6 @@ static void hvmemul_put_fpu(
 {
     struct vcpu *curr = current;
 
-    curr->arch.hvm_vcpu.fpu_exception_callback = NULL;
-
     if ( aux )
     {
         typeof(curr->arch.xsave_area->fpu_sse) *fpu_ctxt = curr->arch.fpu_ctxt;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -780,7 +780,6 @@ void do_reserved_trap(struct cpu_user_re
 
 void do_trap(struct cpu_user_regs *regs)
 {
-    struct vcpu *curr = current;
     unsigned int trapnr = regs->entry_vector;
     unsigned long fixup;
 
@@ -800,15 +799,6 @@ void do_trap(struct cpu_user_regs *regs)
         return;
     }
 
-    if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
-         system_state >= SYS_STATE_active && is_hvm_vcpu(curr) &&
-         curr->arch.hvm_vcpu.fpu_exception_callback )
-    {
-        curr->arch.hvm_vcpu.fpu_exception_callback(
-            curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
-        return;
-    }
-
     if ( likely((fixup = search_exception_table(regs)) != 0) )
     {
         dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n",
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -959,6 +959,33 @@ static inline int mkec(uint8_t e, int32_
 #define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
 
 #ifdef __XEN__
+static int exception_from_stub(union stub_exception_token res,
+                               void *stub, unsigned int line,
+                               struct x86_emulate_ctxt *ctxt,
+                               const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_UNHANDLEABLE;
+
+    generate_exception_if(res.fields.trapnr == EXC_MF, EXC_MF);
+    if ( res.fields.trapnr == EXC_XM )
+    {
+        unsigned long cr4;
+
+        if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+            cr4 = X86_CR4_OSXMMEXCPT;
+        generate_exception(cr4 & X86_CR4_OSXMMEXCPT ? EXC_XM : EXC_UD);
+    }
+    gprintk(XENLOG_WARNING,
+            "exception %u (ec=%04x) in emulation stub (line %u)\n",
+            res.fields.trapnr, res.fields.ec, line);
+    gprintk(XENLOG_INFO, "stub: %"__stringify(MAX_INST_LEN)"ph\n", stub);
+    generate_exception_if(res.fields.trapnr == EXC_UD, EXC_UD);
+    domain_crash(current->domain);
+
+ done:
+    return rc;
+}
+
 # define invoke_stub(pre, post, constraints...) do {                    \
     union stub_exception_token res_ = { .raw = ~0 };                    \
     asm volatile ( pre "\n\tcall *%[stub]\n\t" post "\n"                \
@@ -974,14 +1001,8 @@ static inline int mkec(uint8_t e, int32_
                      "m" (*(uint8_t(*)[MAX_INST_LEN + 1])stub.ptr) );   \
     if ( unlikely(~res_.raw) )                                          \
     {                                                                   \
-        gprintk(XENLOG_WARNING,                                         \
-                "exception %u (ec=%04x) in emulation stub (line %u)\n", \
-                res_.fields.trapnr, res_.fields.ec, __LINE__);          \
-        gprintk(XENLOG_INFO, "stub: %"__stringify(MAX_INST_LEN)"ph\n",  \
-                stub.func);                                             \
-        generate_exception_if(res_.fields.trapnr == EXC_UD, EXC_UD);    \
-        domain_crash(current->domain);                                  \
-        goto cannot_emulate;                                            \
+        rc = exception_from_stub(res_, stub.func, __LINE__, ctxt, ops); \
+        goto done;                                                      \
     }                                                                   \
 } while (0)
 #else
@@ -1097,23 +1118,8 @@ do {
     ops->write_segment(x86_seg_cs, cs, ctxt);                           \
 })
 
-struct fpu_insn_ctxt {
-    uint8_t insn_bytes;
-    uint8_t type;
-    int8_t exn_raised;
-};
-
-static void fpu_handle_exception(void *_fic, struct cpu_user_regs *regs)
-{
-    struct fpu_insn_ctxt *fic = _fic;
-    ASSERT(regs->entry_vector < 0x20);
-    fic->exn_raised = regs->entry_vector;
-    regs->r(ip) += fic->insn_bytes;
-}
-
 static int _get_fpu(
     enum x86_emulate_fpu_type type,
-    struct fpu_insn_ctxt *fic,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
@@ -1138,14 +1144,13 @@ static int _get_fpu(
         break;
     }
 
-    rc = ops->get_fpu(fpu_handle_exception, fic, type, ctxt);
+    rc = ops->get_fpu(type, ctxt);
 
     if ( rc == X86EMUL_OKAY )
     {
         unsigned long cr0;
 
         fail_if(type == X86EMUL_FPU_fpu && !ops->put_fpu);
-        fic->type = type;
 
         fail_if(!ops->read_cr);
         if ( type >= X86EMUL_FPU_xmm )
@@ -1183,37 +1188,22 @@ static int _get_fpu(
     return rc;
 }
 
-#define get_fpu(_type, _fic)                                    \
+#define get_fpu(type)                                           \
 do {                                                            \
-    rc = _get_fpu(_type, _fic, ctxt, ops);                      \
+    rc = _get_fpu(fpu_type = (type), ctxt, ops);                \
     if ( rc ) goto done;                                        \
 } while (0)
 
-#define check_fpu_exn(fic)                                      \
-do {                                                            \
-    generate_exception_if((fic)->exn_raised >= 0,               \
-                          (fic)->exn_raised);                   \
-} while (0)
-
-#define check_xmm_exn(fic)                                      \
-do {                                                            \
-    if ( (fic)->exn_raised == EXC_XM && ops->read_cr &&         \
-         ops->read_cr(4, &cr4, ctxt) == X86EMUL_OKAY &&         \
-         !(cr4 & X86_CR4_OSXMMEXCPT) )                          \
-        (fic)->exn_raised = EXC_UD;                             \
-    check_fpu_exn(fic);                                         \
-} while (0)
-
 static void put_fpu(
-    struct fpu_insn_ctxt *fic,
+    enum x86_emulate_fpu_type type,
     bool failed_late,
     const struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
-    if ( unlikely(failed_late) && fic->type == X86EMUL_FPU_fpu )
+    if ( unlikely(failed_late) && type == X86EMUL_FPU_fpu )
         ops->put_fpu(ctxt, X86EMUL_FPU_fpu, NULL);
-    else if ( unlikely(fic->type == X86EMUL_FPU_fpu) && !state->fpu_ctrl )
+    else if ( unlikely(type == X86EMUL_FPU_fpu) && !state->fpu_ctrl )
     {
         struct x86_emul_fpu_aux aux = {
             .ip = ctxt->regs->r(ip),
@@ -1247,9 +1237,8 @@ static void put_fpu(
         }
         ops->put_fpu(ctxt, X86EMUL_FPU_none, &aux);
     }
-    else if ( fic->type != X86EMUL_FPU_none && ops->put_fpu )
+    else if ( type != X86EMUL_FPU_none && ops->put_fpu )
         ops->put_fpu(ctxt, X86EMUL_FPU_none, NULL);
-    fic->type = X86EMUL_FPU_none;
 }
 
 static inline bool fpu_check_write(void)
@@ -1264,29 +1253,27 @@ static inline bool fpu_check_write(void)
 #define emulate_fpu_insn_memdst(opc, ext, arg)                          \
 do {                                                                    \
     /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
-    fic.insn_bytes = 2;                                                 \
+    insn_bytes = 2;                                                     \
     memcpy(get_stub(stub),                                              \
            ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
-    invoke_stub("", "", "+m" (fic), "+m" (arg) : "a" (&(arg)));         \
+    invoke_stub("", "", "+m" (arg) : "a" (&(arg)));                     \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_memsrc(opc, ext, arg)                          \
 do {                                                                    \
     /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
-    fic.insn_bytes = 2;                                                 \
     memcpy(get_stub(stub),                                              \
            ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
-    invoke_stub("", "", "+m" (fic) : "m" (arg), "a" (&(arg)));          \
+    invoke_stub("", "", "=m" (dummy) : "m" (arg), "a" (&(arg)));        \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
-    fic.insn_bytes = nr_;                                               \
     memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
-    invoke_stub("", "", "=m" (fic) : "m" (fic));                        \
+    invoke_stub("", "", "=m" (dummy) : "i" (0));                        \
     put_stub(stub);                                                     \
 } while (0)
 
@@ -1294,12 +1281,10 @@ do {
 do {                                                                    \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
     unsigned long tmp_;                                                 \
-    fic.insn_bytes = nr_;                                               \
     memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
     invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),             \
                 _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),            \
-                [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_),       \
-                "+m" (fic)                                              \
+                [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_)        \
                 : [mask] "i" (X86_EFLAGS_ZF|X86_EFLAGS_PF|X86_EFLAGS_CF)); \
     put_stub(stub);                                                     \
 } while (0)
@@ -3134,14 +3119,14 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0;
+    unsigned int first_byte = 0, insn_bytes = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4;
-    struct fpu_insn_ctxt fic = { .type = X86EMUL_FPU_none, .exn_raised = -1 };
+    enum x86_emulate_fpu_type fpu_type = X86EMUL_FPU_none;
     struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
 
@@ -3830,9 +3815,8 @@ x86_emulate(
 
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_wait, &fic);
+        get_fpu(X86EMUL_FPU_wait);
         emulate_fpu_insn_stub(b);
-        check_fpu_exn(&fic);
         break;
 
     case 0x9c: /* pushf */
@@ -4235,7 +4219,7 @@ x86_emulate(
 
     case 0xd8: /* FPU 0xd8 */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fadd %stN,%st */
@@ -4257,12 +4241,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xd9: /* FPU 0xd9 */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xfb: /* fsincos */
@@ -4344,12 +4327,11 @@ x86_emulate(
             if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xda: /* FPU 0xda */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fcmovb %stN */
@@ -4366,12 +4348,11 @@ x86_emulate(
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
             goto fpu_memsrc32;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdb: /* FPU 0xdb */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fcmovnb %stN */
@@ -4424,12 +4405,11 @@ x86_emulate(
                 generate_exception(EXC_UD);
             }
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdc: /* FPU 0xdc */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fadd %st,%stN */
@@ -4451,12 +4431,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdd: /* FPU 0xdd */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* ffree %stN */
@@ -4500,12 +4479,11 @@ x86_emulate(
             if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xde: /* FPU 0xde */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* faddp %stN */
@@ -4523,12 +4501,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdf: /* FPU 0xdf */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xe0:
@@ -4573,7 +4550,6 @@ x86_emulate(
                 goto fpu_memdst64;
             }
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
@@ -5393,7 +5369,7 @@ x86_emulate(
         else
             generate_exception(EXC_UD);
 
-        get_fpu(X86EMUL_FPU_mmx, &fic);
+        get_fpu(X86EMUL_FPU_mmx);
 
         d = DstReg | SrcMem;
         op_bytes = 8;
@@ -5483,7 +5459,7 @@ x86_emulate(
             else
                 vcpu_must_have(sse);
     simd_0f_xmm:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
@@ -5493,7 +5469,7 @@ x86_emulate(
     simd_0f_avx:
             host_and_vcpu_must_have(avx);
     simd_0f_ymm:
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
     simd_0f_common:
         opc = init_prefixes(stub);
@@ -5506,7 +5482,7 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         break;
 
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
@@ -5593,12 +5569,12 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         if ( ea.type == OP_MEM )
@@ -5624,7 +5600,7 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
@@ -5632,7 +5608,7 @@ x86_emulate(
                 vex.l = 0;
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -5655,17 +5631,14 @@ x86_emulate(
             opc[1] = modrm & 0xc7;
         if ( !mode_64bit() )
             vex.w = 0;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         ea.reg = decode_register(modrm_reg, &_regs, 0);
-        invoke_stub("", "", "=a" (*ea.reg), "+m" (fic.exn_raised)
-                            : "c" (mmvalp), "m" (*mmvalp));
+        invoke_stub("", "", "=a" (*ea.reg) : "c" (mmvalp), "m" (*mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         state->simd_size = simd_none;
         break;
 
@@ -5679,13 +5652,13 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -5703,20 +5676,17 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     [eflags] "+g" (_regs.eflags),
-                    [tmp] "=&r" (dummy), "+m" (*mmvalp),
-                    "+m" (fic.exn_raised)
+                    [tmp] "=&r" (dummy), "+m" (*mmvalp)
                     : "a" (mmvalp), [mask] "i" (EFLAGS_MASK));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -5854,9 +5824,9 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0xc7;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
     simd_0f_to_gpr:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
 
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
@@ -5875,9 +5845,9 @@ x86_emulate(
                     vcpu_must_have(sse);
             }
             if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
-                get_fpu(X86EMUL_FPU_xmm, &fic);
+                get_fpu(X86EMUL_FPU_xmm);
             else
-                get_fpu(X86EMUL_FPU_mmx, &fic);
+                get_fpu(X86EMUL_FPU_mmx);
         }
         else
         {
@@ -5886,14 +5856,13 @@ x86_emulate(
                 host_and_vcpu_must_have(avx);
             else
                 host_and_vcpu_must_have(avx2);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         ASSERT(!state->simd_size);
         dst.bytes = 4;
@@ -6059,7 +6028,7 @@ x86_emulate(
             goto simd_0f_sse2;
     simd_0f_mmx:
         host_and_vcpu_must_have(mmx);
-        get_fpu(X86EMUL_FPU_mmx, &fic);
+        get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
@@ -6070,17 +6039,17 @@ x86_emulate(
         {
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
     simd_0f_rm:
@@ -6092,17 +6061,14 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
-        invoke_stub("", "", "+m" (src.val), "+m" (fic.exn_raised)
-                            : "a" (&src.val));
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
         dst.val = src.val;
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -6168,19 +6134,19 @@ x86_emulate(
                 host_and_vcpu_must_have(avx);
             }
     simd_0f_imm8_ymm:
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
     simd_0f_imm8_sse2:
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
             vcpu_must_have(mmxext);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
     simd_0f_imm8:
         opc = init_prefixes(stub);
@@ -6194,7 +6160,7 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         break;
 
     CASE_SIMD_PACKED_INT(0x0f, 0x71):    /* Grp12 */
@@ -6222,33 +6188,31 @@ x86_emulate(
                 host_and_vcpu_must_have(avx2);
             else
                 host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
     simd_0f_reg_only:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", [dummy_out] "=g" (dummy) : [dummy_in] "i" (0) );
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -6283,7 +6247,7 @@ x86_emulate(
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
 
 #ifdef __x86_64__
             if ( !mode_64bit() )
@@ -6325,12 +6289,12 @@ x86_emulate(
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         opc = init_prefixes(stub);
         opc[0] = b;
-        fic.insn_bytes = PFX_BYTES + 1;
+        insn_bytes = PFX_BYTES + 1;
         goto simd_0f_reg_only;
 
     case X86EMUL_OPC_66(0x0f, 0x78):     /* Grp17 */
@@ -6346,14 +6310,14 @@ x86_emulate(
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
         host_and_vcpu_must_have(sse4a);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
 
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         opc[2] = imm1;
         opc[3] = imm2;
-        fic.insn_bytes = PFX_BYTES + 4;
+        insn_bytes = PFX_BYTES + 4;
         goto simd_0f_reg_only;
 
     case X86EMUL_OPC_66(0x0f, 0x79):     /* extrq xmm,xmm */
@@ -6481,7 +6445,7 @@ x86_emulate(
             vcpu_must_have(sse);
         ldmxcsr:
             generate_exception_if(src.type != OP_MEM, EXC_UD);
-            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm, &fic);
+            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             generate_exception_if(src.val & ~mxcsr_mask, EXC_GP, 0);
             asm volatile ( "ldmxcsr %0" :: "m" (src.val) );
             break;
@@ -6491,7 +6455,7 @@ x86_emulate(
             vcpu_must_have(sse);
         stmxcsr:
             generate_exception_if(dst.type != OP_MEM, EXC_UD);
-            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm, &fic);
+            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             asm volatile ( "stmxcsr %0" : "=m" (dst.val) );
             break;
 
@@ -6745,7 +6709,7 @@ x86_emulate(
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
                 goto simd_0f_imm8_sse2;
             vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
             goto simd_0f_imm8;
         }
         goto simd_0f_imm8_avx;
@@ -6777,7 +6741,7 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0xc7;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
@@ -7023,18 +6987,18 @@ x86_emulate(
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             d |= TwoOp;
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
             vcpu_must_have(mmxext);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         /*
@@ -7054,7 +7018,6 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0xc7;
-        fic.insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
@@ -7067,6 +7030,7 @@ x86_emulate(
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
         /* Restore high bit of XMM destination. */
         if ( sfence )
         {
@@ -7113,12 +7077,12 @@ x86_emulate(
         if ( vex.pfx )
         {
     simd_0f38_common:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
         opc = init_prefixes(stub);
         opc[0] = 0x38;
@@ -7131,7 +7095,7 @@ x86_emulate(
             vex.b = 1;
             opc[2] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         break;
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
@@ -7155,13 +7119,13 @@ x86_emulate(
         if ( vex.opcx == vex_none )
         {
             host_and_vcpu_must_have(sse4_1);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -7180,21 +7144,19 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
+            ++insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         emulate_stub("+m" (*mmvalp), "a" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         state->simd_size = simd_none;
         dst.type = OP_NONE;
         break;
@@ -7283,7 +7245,7 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
         host_and_vcpu_must_have(avx);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /*
          * While we can't reasonably provide fully correct behavior here
@@ -7332,7 +7294,7 @@ x86_emulate(
         rex_prefix &= ~REX_B;
         vex.b = 1;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
 
         break;
     }
@@ -7381,7 +7343,7 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         host_and_vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /*
          * While we can't reasonably provide fully correct behavior here
@@ -7428,7 +7390,7 @@ x86_emulate(
         rex_prefix &= ~REX_B;
         vex.b = 1;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
 
         break;
     }
@@ -7451,7 +7413,7 @@ x86_emulate(
                               state->sib_index == mask_reg, EXC_UD);
         generate_exception_if(!cpu_has_avx, EXC_UD);
         vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /* Read destination, index, and mask registers. */
         opc = init_prefixes(stub);
@@ -7788,12 +7750,12 @@ x86_emulate(
         if ( vex.pfx )
         {
     simd_0f3a_common:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
         opc = init_prefixes(stub);
         opc[0] = 0x3a;
@@ -7807,7 +7769,7 @@ x86_emulate(
             opc[2] &= 0x38;
         }
         opc[3] = imm1;
-        fic.insn_bytes = PFX_BYTES + 4;
+        insn_bytes = PFX_BYTES + 4;
         break;
 
     case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
@@ -7815,7 +7777,7 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
     case X86EMUL_OPC_66(0x0f3a, 0x17): /* extractps $imm8,xmm,r/m */
         host_and_vcpu_must_have(sse4_1);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
 
         opc = init_prefixes(stub);
         opc++[0] = 0x3a;
@@ -7828,20 +7790,16 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0x38;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
-
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         ASSERT(!state->simd_size);
         dst.bytes = dst.type == OP_REG || b == 0x17 ? 4 : 1 << (b & 3);
@@ -7855,7 +7813,7 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
         host_and_vcpu_must_have(avx);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
         opc = init_prefixes(stub);
         goto pextr;
 
@@ -7877,17 +7835,15 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
 
         copy_VEX(opc, vex);
         /* Latch MXCSR - we may need to restore it below. */
         invoke_stub("stmxcsr %[mxcsr]", "",
-                    "=m" (*mmvalp), "+m" (fic.exn_raised), [mxcsr] "=m" (mxcsr)
-                    : "a" (mmvalp));
+                    "=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         if ( ea.type == OP_MEM )
         {
@@ -7906,7 +7862,7 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
         memcpy(mmvalp, &src.val, op_bytes);
         ea.type = OP_MEM;
         op_bytes = src.bytes;
@@ -8014,13 +7970,13 @@ x86_emulate(
         if ( vex.opcx == vex_none )
         {
             host_and_vcpu_must_have(sse4_2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -8041,13 +7997,13 @@ x86_emulate(
                 goto done;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
+            ++insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
@@ -8275,7 +8231,7 @@ x86_emulate(
 
         if ( !opc )
             BUG();
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
         copy_REX_VEX(opc, rex_prefix, vex);
 
         if ( ea.type == OP_MEM )
@@ -8352,13 +8308,11 @@ x86_emulate(
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
                                       X86EMUL_OPC_ENCODING_MASK)) !=
                     X86EMUL_OPC(0x0f, 0xf7)) )
-            invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
-                                : "a" (mmvalp));
+            invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
         else
             invoke_stub("", "", "+m" (*mmvalp) : "D" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
     }
 
     switch ( dst.type )
@@ -8401,7 +8355,8 @@ x86_emulate(
     }
 
  complete_insn: /* Commit shadow register state. */
-    put_fpu(&fic, false, state, ctxt, ops);
+    put_fpu(fpu_type, false, state, ctxt, ops);
+    fpu_type = X86EMUL_FPU_none;
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -8425,7 +8380,7 @@ x86_emulate(
     ctxt->regs->eflags &= ~X86_EFLAGS_RF;
 
  done:
-    put_fpu(&fic, fic.insn_bytes > 0 && dst.type == OP_MEM, state, ctxt, ops);
+    put_fpu(fpu_type, insn_bytes > 0 && dst.type == OP_MEM, state, ctxt, ops);
     put_stub(stub);
     return rc;
 #undef state
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -428,12 +428,8 @@ struct x86_emulate_ops
 
     /*
      * get_fpu: Load emulated environment's FPU state onto processor.
-     *  @exn_callback: On any FPU or SIMD exception, pass control to
-     *                 (*exception_callback)(exception_callback_arg, regs).
      */
     int (*get_fpu)(
-        void (*exception_callback)(void *, struct cpu_user_regs *),
-        void *exception_callback_arg,
         enum x86_emulate_fpu_type type,
         struct x86_emulate_ctxt *ctxt);
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -196,10 +196,6 @@ struct hvm_vcpu {
 
     struct hvm_vcpu_io  hvm_io;
 
-    /* Callback into x86_emulate when emulating FPU/MMX/XMM instructions. */
-    void (*fpu_exception_callback)(void *, struct cpu_user_regs *);
-    void *fpu_exception_callback_arg;
-
     /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
     struct x86_event     inject_event;
 

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 00/17] x86: emulator enhancements
  2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
                   ` (16 preceding siblings ...)
  2017-06-21 12:09 ` [PATCH 17/17] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
@ 2017-09-05 17:08 ` George Dunlap
  17 siblings, 0 replies; 27+ messages in thread
From: George Dunlap @ 2017-09-05 17:08 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper

Jan,

I'm really sorry, but would you mind sending this again (rebased if
you have it, without attachments)?

My tooling is completely failing to do anything sensible with this.

 -George

On Wed, Jun 21, 2017 at 12:23 PM, Jan Beulich <JBeulich@suse.com> wrote:
> 01: support remaining AVX insns
> 02: re-order cases of main switch statement
> 03: build SIMD tests with -Os
> 04: support F16C insns
> 05: support FMA4 insns
> 06: support FMA insns
> 07: support most remaining AVX2 insns
> 08: fold/eliminate some local variables
> 09: support AVX2 gather insns
> 10: add tables for XOP 08 and 09 extension spaces
> 11: support XOP insns
> 12: support 3DNow! insns
> 13: re-order checks in test harness
> 14: abstract out XCRn accesses
> 15: adjust_bnd() should check XCR0
> 16: make all FPU emulation use the stub
> 17: eliminate custom #MF/#XM handling
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 01/17] x86emul: support remaining AVX insns
  2017-06-21 11:59 ` [PATCH 01/17] x86emul: support remaining AVX insns Jan Beulich
@ 2017-09-13 15:02   ` George Dunlap
  2017-09-13 15:31     ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: George Dunlap @ 2017-09-13 15:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper

On Wed, Jun 21, 2017 at 12:59 PM, Jan Beulich <JBeulich@suse.com> wrote:
> I.e. those not being equivalents of SSEn ones.
>
> There's one necessary change to generic code: Faulting behavior of
> VMASKMOVP{S,D} requires us to do partial reads/writes.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Having dug a bit through the manuals about AVX and then come back to this
patch again, I feel the same way that I did when I first read it:
this description is woefully inadequate.  It doesn't look so much like
a changelog as a note to yourself.  If someone else sent a patch
description like this for a patch this complicated and expected you to
review it, you'd certainly complain, and there's a good chance you
wouldn't review it.

It's not your job to teach me how this code is laid out and how it
works, but there at least needs to be enough description of what the
patch is actually trying to do that I have some hope of reconstructing
its intent.
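
To illustrate the kind of detail that's missing: the one generic-code
change the description does mention is needed because VMASKMOVP{S,D}
has per-element faulting semantics -- elements whose mask sign bit is
clear must neither access memory nor fault, while selected elements
still can. Roughly (an illustrative sketch of the architectural
behavior, not the emulator's actual code):

#include <stdint.h>

static void masked_load32(uint32_t dst[], const uint32_t *mem,
                          const uint32_t mask[], unsigned int elems)
{
    for ( unsigned int i = 0; i < elems; i++ )
        if ( mask[i] & 0x80000000u )  /* sign bit selects the element */
            dst[i] = mem[i];          /* may fault on this element only */
        else
            dst[i] = 0;               /* masked elements are zeroed */
}

That per-element behavior is why a single full-width read/write no
longer suffices, hence the "partial reads/writes" remark.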

 -George

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 02/17] x86emul: re-order cases of main switch statement
  2017-06-21 11:59 ` [PATCH 02/17] x86emul: re-order cases of main switch statement Jan Beulich
@ 2017-09-13 15:15   ` George Dunlap
  0 siblings, 0 replies; 27+ messages in thread
From: George Dunlap @ 2017-09-13 15:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper

On Wed, Jun 21, 2017 at 12:59 PM, Jan Beulich <JBeulich@suse.com> wrote:
> Restore the intended numerical ordering, which has become "violated"
> mostly by incremental additions where moving around bigger chunks did
> not seem advisable. There is one exception, though, at the very top of
> the switch(): keeping the arithmetic ops together seems preferable
> over entirely strict ordering.

+1

> Additionally move a few macro definitions before their first uses (the
> placement is benign as long as those uses are themselves only macro
> definitions, but that's going to change when those macros have helpers
> broken out).
>
> No (intended) functional change.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

As far as I can tell, the 'no functional change' claim is true:

Reviewed-by: George Dunlap <george.dunlap@citrix.com>

>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -843,6 +843,27 @@ do{ asm volatile (
>  #define __emulate_1op_8byte(_op, _dst, _eflags)
>  #endif /* __i386__ */
>
> +#define fail_if(p)                                      \
> +do {                                                    \
> +    rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY;     \
> +    if ( rc ) goto done;                                \
> +} while (0)
> +
> +static inline int mkec(uint8_t e, int32_t ec, ...)
> +{
> +    return (e < 32 && ((1u << e) & EXC_HAS_EC)) ? ec : X86_EVENT_NO_EC;
> +}
> +
> +#define generate_exception_if(p, e, ec...)                                \
> +({  if ( (p) ) {                                                          \
> +        x86_emul_hw_exception(e, mkec(e, ##ec, 0), ctxt);                 \
> +        rc = X86EMUL_EXCEPTION;                                           \
> +        goto done;                                                        \
> +    }                                                                     \
> +})
> +
> +#define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
> +
>  #ifdef __XEN__
>  # define invoke_stub(pre, post, constraints...) do {                    \
>      union stub_exception_token res_ = { .raw = ~0 };                    \
> @@ -911,27 +932,6 @@ do{ asm volatile (
>  # define mode_64bit() false
>  #endif
>
> -#define fail_if(p)                                      \
> -do {                                                    \
> -    rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY;     \
> -    if ( rc ) goto done;                                \
> -} while (0)
> -
> -static inline int mkec(uint8_t e, int32_t ec, ...)
> -{
> -    return (e < 32 && ((1u << e) & EXC_HAS_EC)) ? ec : X86_EVENT_NO_EC;
> -}
> -
> -#define generate_exception_if(p, e, ec...)                                \
> -({  if ( (p) ) {                                                          \
> -        x86_emul_hw_exception(e, mkec(e, ##ec, 0), ctxt);                 \
> -        rc = X86EMUL_EXCEPTION;                                           \
> -        goto done;                                                        \
> -    }                                                                     \
> -})
> -
> -#define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
> -
>  /*
>   * Given byte has even parity (even number of 1s)? SDM Vol. 1 Sec. 3.4.3.1,
>   * "Status Flags": EFLAGS.PF reflects parity of least-sig. byte of result only.
> @@ -3596,6 +3596,11 @@ x86_emulate(
>              dst.bytes = 2;
>          break;
>
> +    case 0x8d: /* lea */
> +        generate_exception_if(ea.type != OP_MEM, EXC_UD);
> +        dst.val = ea.mem.off;
> +        break;
> +
>      case 0x8e: /* mov r/m,Sreg */
>          seg = modrm_reg & 7; /* REX.R is ignored. */
>          generate_exception_if(!is_x86_user_segment(seg) ||
> @@ -3607,11 +3612,6 @@ x86_emulate(
>          dst.type = OP_NONE;
>          break;
>
> -    case 0x8d: /* lea */
> -        generate_exception_if(ea.type != OP_MEM, EXC_UD);
> -        dst.val = ea.mem.off;
> -        break;
> -
>      case 0x8f: /* pop (sole member of Grp1a) */
>          generate_exception_if((modrm_reg & 7) != 0, EXC_UD);
>          /* 64-bit mode: POP defaults to a 64-bit operand. */
> @@ -5746,12 +5746,6 @@ x86_emulate(
>          _regs.r(ax) = (uint32_t)msr_val;
>          break;
>
> -    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
> -        vcpu_must_have(cmov);
> -        if ( test_cc(b, _regs.eflags) )
> -            dst.val = src.val;
> -        break;
> -
>      case X86EMUL_OPC(0x0f, 0x34): /* sysenter */
>          vcpu_must_have(sep);
>          generate_exception_if(mode_ring0(), EXC_GP, 0);
> @@ -5834,6 +5828,12 @@ x86_emulate(
>          singlestep = _regs.eflags & X86_EFLAGS_TF;
>          break;
>
> +    case X86EMUL_OPC(0x0f, 0x40) ... X86EMUL_OPC(0x0f, 0x4f): /* cmovcc */
> +        vcpu_must_have(cmov);
> +        if ( test_cc(b, _regs.eflags) )
> +            dst.val = src.val;
> +        break;
> +
>      CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
>      CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
>      CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
> @@ -6050,10 +6050,6 @@ x86_emulate(
>          get_fpu(X86EMUL_FPU_mmx, &fic);
>          goto simd_0f_common;
>
> -    case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
> -        generate_exception_if(vex.l, EXC_UD);
> -        goto simd_0f_avx;
> -
>      CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
>      case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
>      CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
> @@ -6351,12 +6347,6 @@ x86_emulate(
>          op_bytes = 8;
>          goto simd_0f_xmm;
>
> -    case X86EMUL_OPC_F3(0x0f, 0x7e):     /* movq xmm/m64,xmm */
> -    case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
> -        generate_exception_if(vex.l, EXC_UD);
> -        op_bytes = 8;
> -        goto simd_0f_int;
> -
>      case X86EMUL_OPC_F2(0x0f, 0xf0):     /* lddqu m128,xmm */
>      case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
>          generate_exception_if(ea.type != OP_MEM, EXC_UD);
> @@ -6376,6 +6366,12 @@ x86_emulate(
>          op_bytes = 16 << vex.l;
>          goto simd_0f_sse3_avx;
>
> +    case X86EMUL_OPC_F3(0x0f, 0x7e):     /* movq xmm/m64,xmm */
> +    case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
> +        generate_exception_if(vex.l, EXC_UD);
> +        op_bytes = 8;
> +        goto simd_0f_int;
> +
>      case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
>          if ( test_cc(b, _regs.eflags) )
>              jmp_rel((int32_t)src.val);
> @@ -7134,39 +7130,6 @@ x86_emulate(
>          generate_exception_if(vex.w, EXC_UD);
>          goto simd_0f_avx;
>
> -    case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
> -        op_bytes = 16 >> pmov_convert_delta[b & 7];
> -        /* fall through */
> -    case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
> -        host_and_vcpu_must_have(sse4_1);
> -        goto simd_0f38_common;
> -
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x0e): /* vtestps {x,y}mm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x0f): /* vtestpd {x,y}mm/mem,{x,y}mm */
>          generate_exception_if(vex.w, EXC_UD);
> @@ -7220,6 +7183,39 @@ x86_emulate(
>          dst.type = OP_NONE;
>          break;
>
> +    case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
> +        op_bytes = 16 >> pmov_convert_delta[b & 7];
> +        /* fall through */
> +    case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
> +        host_and_vcpu_must_have(sse4_1);
> +        goto simd_0f38_common;
> +
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
> @@ -7318,16 +7314,6 @@ x86_emulate(
>          host_and_vcpu_must_have(sse4_2);
>          goto simd_0f38_common;
>
> -    case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
> -    case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
> -    case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
> -    case X86EMUL_OPC(0x0f38, 0xcb):     /* sha256rnds2 XMM0,xmm/m128,xmm */
> -    case X86EMUL_OPC(0x0f38, 0xcc):     /* sha256msg1 xmm/m128,xmm */
> -    case X86EMUL_OPC(0x0f38, 0xcd):     /* sha256msg2 xmm/m128,xmm */
> -        host_and_vcpu_must_have(sha);
> -        op_bytes = 16;
> -        goto simd_0f38_common;
> -
>      case X86EMUL_OPC_66(0x0f38, 0xdb):     /* aesimc xmm/m128,xmm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
>      case X86EMUL_OPC_66(0x0f38, 0xdc):     /* aesenc xmm/m128,xmm,xmm */
> @@ -7341,9 +7327,21 @@ x86_emulate(
>          host_and_vcpu_must_have(aesni);
>          if ( vex.opcx == vex_none )
>              goto simd_0f38_common;
> +        /* fall through */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
>          generate_exception_if(vex.l, EXC_UD);
>          goto simd_0f_avx;
>
> +    case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
> +    case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
> +    case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
> +    case X86EMUL_OPC(0x0f38, 0xcb):     /* sha256rnds2 XMM0,xmm/m128,xmm */
> +    case X86EMUL_OPC(0x0f38, 0xcc):     /* sha256msg1 xmm/m128,xmm */
> +    case X86EMUL_OPC(0x0f38, 0xcd):     /* sha256msg2 xmm/m128,xmm */
> +        host_and_vcpu_must_have(sha);
> +        op_bytes = 16;
> +        goto simd_0f38_common;
> +
>      case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
>      case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
>          vcpu_must_have(movbe);
> @@ -7519,6 +7517,19 @@ x86_emulate(
>          generate_exception_if(vex.w, EXC_UD);
>          goto simd_0f_imm8_avx;
>
> +    case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
> +    case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
> +        host_and_vcpu_must_have(sse4_1);
> +        goto simd_0f3a_common;
> +
>      case X86EMUL_OPC(0x0f3a, 0x0f):    /* palignr $imm8,mm/m64,mm */
>      case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
>          host_and_vcpu_must_have(ssse3);
> @@ -7547,19 +7558,6 @@ x86_emulate(
>          fic.insn_bytes = PFX_BYTES + 4;
>          break;
>
> -    case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
> -    case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
> -        host_and_vcpu_must_have(sse4_1);
> -        goto simd_0f3a_common;
> -
>      case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
>      case X86EMUL_OPC_66(0x0f3a, 0x15): /* pextrw $imm8,xmm,r/m */
>      case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 03/17] x86emul: build SIMD tests with -Os
  2017-06-21 12:00 ` [PATCH 03/17] x86emul: build SIMD tests with -Os Jan Beulich
@ 2017-09-13 15:19   ` George Dunlap
  2017-09-13 15:34     ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: George Dunlap @ 2017-09-13 15:19 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper

On Wed, Jun 21, 2017 at 1:00 PM, Jan Beulich <JBeulich@suse.com> wrote:
> Namely in the context of putting together subsequent patches I've
> noticed that together with the touch() macro using -Os further
> increases the chances of the compiler using memory operands for the
> instructions we actually care to test.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
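
(For context, a minimal sketch of the pattern being relied on here; the
touch() form below mirrors the spirit of the harness's helper, but this
standalone fragment is illustrative only:)

/* Forcing a variable through memory makes the compiler more likely to
 * encode it as a memory operand of the SIMD instruction under test;
 * -Os then discourages reloading it into a register first. */
#define touch(var) asm volatile ( "" : "+m" (var) )

typedef float v4sf __attribute__((vector_size(16)));

v4sf addps_mem(v4sf x, v4sf y)
{
    touch(y);      /* y now lives in memory as far as the compiler knows */
    return x + y;  /* likely emitted with an m128 source operand */
}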

I'm not sure how I'm supposed to evaluate the claim, but it certainly
looks like the patch does what it says on the tin, so:

Reviewed-by: George Dunlap <george.dunlap@citrix.com>

>
> --- a/tools/tests/x86_emulator/Makefile
> +++ b/tools/tests/x86_emulator/Makefile
> @@ -45,17 +45,17 @@ define simd-defs
>  $(1)-cflags := \
>         $(foreach vec,$($(1)-vecs), \
>           $(foreach int,$($(1)-ints), \
> -           "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> -           "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
> +           "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> +           "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
>           $(foreach flt,$($(1)-flts), \
> -           "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
> +           "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
>         $(foreach flt,$($(1)-flts), \
> -         "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
> +         "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
>  $(1)-avx-cflags := \
>         $(foreach vec,$($(1)-vecs), \
>           $(foreach int,$($(1)-ints), \
> -           "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> -           "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
> +           "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
> +           "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
>  endef
>
>  $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
>
>
>
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 01/17] x86emul: support remaining AVX insns
  2017-09-13 15:02   ` George Dunlap
@ 2017-09-13 15:31     ` Jan Beulich
  0 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-09-13 15:31 UTC (permalink / raw)
  To: George Dunlap; +Cc: Andrew Cooper, xen-devel

>>> On 13.09.17 at 17:02, <dunlapg@umich.edu> wrote:
> On Wed, Jun 21, 2017 at 12:59 PM, Jan Beulich <JBeulich@suse.com> wrote:
>> I.e. those not being equivalents of SSEn ones.
>>
>> There's one necessary change to generic code: Faulting behavior of
>> VMASKMOVP{S,D} requires us to do partial reads/writes.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Having dug a bit through the manuals about AVX, then come back to this
> patch again, I feel the same way that I did when I first read it:
> This description is woefully inadequate.  It doesn't look so much like
> a changelog as a note to yourself.  If someone else sent a patch
> description like this for a patch this complicated and expected you to
> review it, you'd certainly complain, and there's a good chance you
> wouldn't review it.
> 
> It's not your job to teach me how this code is laid out and how it
> works, but there at least needs to be enough description of what the
> patch is actually trying to do that I have some hope of reconstructing
> its intent.

Hmm, for someone not familiar with prior changes in this direction
(SSE etc) I can see how the brevity might be confusing / unhelpful.
However, the title of the patch says all that is to be said about
"its intent". The description really just points out the one thing
that is not in line with code that's already there. And I'm sure you
don't mean me to enumerate the individual instructions support is
being added for. Yet beyond that and beyond the partial write
handling there's really nothing the patch does. So I'm sort of lost
as to what additional information I could provide.
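
To make the partial-access point concrete, here is a rough sketch of
what VMASKMOVP{S,D} stores require (nelems, mask[] and src[] are
invented names; only the write hook matches the emulator's interface):

/* Illustrative only, not the patch's actual code: elements whose mask
 * sign bit is clear must not be accessed at all, so the store has to
 * be carried out element by element. */
for ( i = 0; i < nelems; i++ )
{
    if ( !(mask[i] & 0x80000000) )
        continue;                    /* masked off: no access, no fault */
    rc = ops->write(ea.mem.seg, ea.mem.off + i * 4, &src[i], 4, ctxt);
    if ( rc != X86EMUL_OKAY )
        goto done;                   /* earlier elements remain written */
}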

Jan


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 03/17] x86emul: build SIMD tests with -Os
  2017-09-13 15:19   ` George Dunlap
@ 2017-09-13 15:34     ` Jan Beulich
  0 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-09-13 15:34 UTC (permalink / raw)
  To: George Dunlap; +Cc: Andrew Cooper, xen-devel

>>> On 13.09.17 at 17:19, <dunlapg@umich.edu> wrote:
> On Wed, Jun 21, 2017 at 1:00 PM, Jan Beulich <JBeulich@suse.com> wrote:
>> Namely in the context of putting together subsequent patches I've
>> noticed that together with the touch() macro using -Os further
>> increases the chances of the compiler using memory operands for the
>> instructions we actually care to test.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> I'm not sure how I'm supposed to evaluate the claim, but it certainly

Well, looking at the generated code for those tests is probably the
only way.

> looks like the patch does what it says on the tin, so:
> 
> Reviewed-by: George Dunlap <george.dunlap@citrix.com>

Thanks (also for the other patch)!

Jan


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 04/17] x86emul: support F16C insns
  2017-06-21 12:01 ` [PATCH 04/17] x86emul: support F16C insns Jan Beulich
@ 2017-09-13 17:10   ` George Dunlap
  2017-09-14  9:13     ` George Dunlap
  0 siblings, 1 reply; 27+ messages in thread
From: George Dunlap @ 2017-09-13 17:10 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper

On Wed, Jun 21, 2017 at 1:01 PM, Jan Beulich <JBeulich@suse.com> wrote:
> Note that this avoids emulating the behavior of VCVTPS2PH found on at
> least some Intel CPUs, which update MXCSR even when the memory write
> faults.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
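
(For orientation, a hedged sketch of the ordering that description
implies; convert_via_stub() is an invented name, while ops->write(),
ea and op_bytes do exist in the emulator:)

/* Convert into a scratch buffer first, attempt the guest write, and
 * only expose the resulting MXCSR state on success. */
uint8_t scratch[16];
uint32_t mxcsr_out;

convert_via_stub(scratch, &mxcsr_out);        /* vcvtps2ph into scratch */
rc = ops->write(ea.mem.seg, ea.mem.off, scratch, op_bytes, ctxt);
if ( rc != X86EMUL_OKAY )
    goto done;                /* faulting write: guest MXCSR untouched */
/* only now fold mxcsr_out into guest-visible state */
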
> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> @@ -3028,6 +3028,47 @@ int main(int argc, char **argv)
>          printf("skipped\n");
>  #endif
>
> +    printf("%-40s", "Testing vcvtph2ps (%ecx),%ymm1...");
> +    if ( stack_exec && cpu_has_f16c )
> +    {
> +        decl_insn(vcvtph2ps);
> +        decl_insn(vcvtps2ph);
> +
> +        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n"
> +                       put_insn(vcvtph2ps, "vcvtph2ps (%0), %%ymm1")
> +                       :: "c" (NULL) );
> +
> +        set_insn(vcvtph2ps);
> +        res[1] = 0x40003c00; /* (1.0, 2.0) */
> +        res[2] = 0x44004200; /* (3.0, 4.0) */
> +        res[3] = 0x3400b800; /* (-.5, .25) */
> +        res[4] = 0xbc000000; /* (0.0, -1.) */
> +        memset(res + 5, 0xff, 16);
> +        regs.ecx = (unsigned long)(res + 1);
> +        rc = x86_emulate(&ctxt, &emulops);
> +        asm volatile ( "vmovups %%ymm1, %0" : "=m" (res[16]) );
> +        if ( rc != X86EMUL_OKAY || !check_eip(vcvtph2ps) )
> +            goto fail;
> +        printf("okay\n");
> +
> +        printf("%-40s", "Testing vcvtps2ph $0,%ymm1,(%edx)...");
> +        asm volatile ( "vmovups %0, %%ymm1\n"
> +                       put_insn(vcvtps2ph, "vcvtps2ph $0, %%ymm1, (%1)")
> +                       :: "m" (res[16]), "d" (NULL) );
> +
> +        set_insn(vcvtps2ph);
> +        memset(res + 7, 0, 32);
> +        regs.edx = (unsigned long)(res + 7);
> +        rc = x86_emulate(&ctxt, &emulops);
> +        if ( rc != X86EMUL_OKAY || !check_eip(vcvtps2ph) ||
> +             memcmp(res + 1, res + 7, 16) ||
> +             res[11] || res[12] || res[13] || res[14] )
> +            goto fail;
> +        printf("okay\n");
> +    }
> +    else
> +        printf("skipped\n");
> +
>  #undef decl_insn
>  #undef put_insn
>  #undef set_insn
> --- a/tools/tests/x86_emulator/x86_emulate.h
> +++ b/tools/tests/x86_emulator/x86_emulate.h
> @@ -127,6 +127,14 @@ static inline uint64_t xgetbv(uint32_t x
>      (res.c & (1U << 28)) != 0; \
>  })
>
> +#define cpu_has_f16c ({ \
> +    struct cpuid_leaf res; \
> +    emul_test_cpuid(1, 0, &res, NULL); \
> +    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
> +        res.c = 0; \
> +    (res.c & (1U << 29)) != 0; \
> +})
> +
>  #define cpu_has_avx2 ({ \
>      struct cpuid_leaf res; \
>      emul_test_cpuid(1, 0, &res, NULL); \
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -369,6 +369,7 @@ static const struct {
>      [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
>      [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
>      [0x10] = { .simd_size = simd_packed_int },
> +    [0x13] = { .simd_size = simd_other, .two_op = 1 },
>      [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
>      [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
>      [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
> @@ -411,6 +412,7 @@ static const struct {
>      [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
>      [0x18] = { .simd_size = simd_128 },
>      [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
> +    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
>      [0x20] = { .simd_size = simd_none },
>      [0x21] = { .simd_size = simd_other },
>      [0x22] = { .simd_size = simd_none },
> @@ -1601,6 +1603,7 @@ static bool vcpu_has(
>  #define vcpu_has_popcnt()      vcpu_has(         1, ECX, 23, ctxt, ops)
>  #define vcpu_has_aesni()       vcpu_has(         1, ECX, 25, ctxt, ops)
>  #define vcpu_has_avx()         vcpu_has(         1, ECX, 28, ctxt, ops)
> +#define vcpu_has_f16c()        vcpu_has(         1, ECX, 29, ctxt, ops)
>  #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
>  #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
>                                 vcpu_has_sse())
> @@ -7216,6 +7219,12 @@ x86_emulate(
>          host_and_vcpu_must_have(sse4_1);
>          goto simd_0f38_common;
>
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
> +        generate_exception_if(vex.w, EXC_UD);
> +        host_and_vcpu_must_have(f16c);
> +        op_bytes = 8 << vex.l;
> +        goto simd_0f_ymm;
> +
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
> @@ -7607,6 +7616,50 @@ x86_emulate(
>          opc = init_prefixes(stub);
>          goto pextr;
>
> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */

On the whole this stanza looks plausible; just a few comments...

> +    {
> +        uint32_t mxcsr;
> +
> +        generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
> +        host_and_vcpu_must_have(f16c);
> +        fail_if(!ops->write);
> +
> +        opc = init_prefixes(stub);
> +        opc[0] = b;
> +        opc[1] = modrm;
> +        if ( ea.type == OP_MEM )
> +        {
> +            /* Convert memory operand to (%rAX). */
> +            vex.b = 1;
> +            opc[1] &= 0x38;
> +        }

First of all, I'm not a fan of modifying the actual decoded vex prefix
here.  I realize that as it happens vex.b won't be read again; but it
just seems like risky behavior.  Would it make more sense to make a
copy of vex and modify that?

Secondly, the logic here is confusing: you first copy the decoded
modrm byte, then modify one bit of the decoded vex prefix, then modify
the copied modrm byte, then (below) copy the (potentially) modified
vex prefix.

It seems like it would be a lot more clear to have the if before the
init_prefixes, and have it look like this:

if ( ea.type == OP_MEM )
{
  /* Convert blah */
  vex.b = 1;
  modrm &= 0x38;
}

(Or lvex and lmodrm, which are local variables containing copies of
the decoded vex prefix and modrm byte.)
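
(For reference, why that masking works; a hedged decode note, not code
from the emulator:)

/* modrm & 0x38 keeps only the reg field (bits 5:3) and zeroes mod and
 * rm, i.e. mod=00, rm=000, which encodes a (%rAX)-based memory operand.
 * vex.b holds the inverted VEX.B bit, so vex.b = 1 means "no REX.B
 * extension" and rm=000 stays %rAX rather than becoming %r8. */
static inline uint8_t to_rax_based(uint8_t modrm)
{
    return modrm & 0x38;
}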

> +        opc[2] = imm1;
> +        fic.insn_bytes = PFX_BYTES + 3;
> +        opc[3] = 0xc3;

Open-coding this 'ret' makes this harder to parse.
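
(Something along these lines, say; RET_OPCODE is a hypothetical name,
no such macro exists today:)

#define RET_OPCODE 0xc3   /* terminates the stub so it can be called */

    opc[2] = imm1;
    fic.insn_bytes = PFX_BYTES + 3;
    opc[3] = RET_OPCODE;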

> +
> +        copy_VEX(opc, vex);

If we move the vex & modrm adjustment above the init_prefixes(), then
this can go right after init_prefixes(); that way the bytes are all
basically set in order.

At the very least this logically goes with the other "set up opc" code, so
if left here the blank line should be after this rather than before
this.

More later. :-)

 -George

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 04/17] x86emul: support F16C insns
  2017-09-13 17:10   ` George Dunlap
@ 2017-09-14  9:13     ` George Dunlap
  2017-09-14 10:24       ` Jan Beulich
  0 siblings, 1 reply; 27+ messages in thread
From: George Dunlap @ 2017-09-14  9:13 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper

On Wed, Sep 13, 2017 at 6:10 PM, George Dunlap <george.dunlap@citrix.com> wrote:
> On Wed, Jun 21, 2017 at 1:01 PM, Jan Beulich <JBeulich@suse.com> wrote:
>> Note that this avoids emulating the behavior of VCVTPS2PH found on at
>> least some Intel CPUs, which update MXCSR even when the memory write
>> faults.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/tools/tests/x86_emulator/test_x86_emulator.c
>> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
>> @@ -3028,6 +3028,47 @@ int main(int argc, char **argv)
>>          printf("skipped\n");
>>  #endif
>>
>> +    printf("%-40s", "Testing vcvtph2ps (%ecx),%ymm1...");
>> +    if ( stack_exec && cpu_has_f16c )
>> +    {
>> +        decl_insn(vcvtph2ps);
>> +        decl_insn(vcvtps2ph);
>> +
>> +        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n"
>> +                       put_insn(vcvtph2ps, "vcvtph2ps (%0), %%ymm1")
>> +                       :: "c" (NULL) );
>> +
>> +        set_insn(vcvtph2ps);
>> +        res[1] = 0x40003c00; /* (1.0, 2.0) */
>> +        res[2] = 0x44004200; /* (3.0, 4.0) */
>> +        res[3] = 0x3400b800; /* (-.5, .25) */
>> +        res[4] = 0xbc000000; /* (0.0, -1.) */
>> +        memset(res + 5, 0xff, 16);
>> +        regs.ecx = (unsigned long)(res + 1);
>> +        rc = x86_emulate(&ctxt, &emulops);
>> +        asm volatile ( "vmovups %%ymm1, %0" : "=m" (res[16]) );
>> +        if ( rc != X86EMUL_OKAY || !check_eip(vcvtph2ps) )
>> +            goto fail;
>> +        printf("okay\n");
>> +
>> +        printf("%-40s", "Testing vcvtps2ph $0,%ymm1,(%edx)...");
>> +        asm volatile ( "vmovups %0, %%ymm1\n"
>> +                       put_insn(vcvtps2ph, "vcvtps2ph $0, %%ymm1, (%1)")
>> +                       :: "m" (res[16]), "d" (NULL) );
>> +
>> +        set_insn(vcvtps2ph);
>> +        memset(res + 7, 0, 32);
>> +        regs.edx = (unsigned long)(res + 7);
>> +        rc = x86_emulate(&ctxt, &emulops);
>> +        if ( rc != X86EMUL_OKAY || !check_eip(vcvtps2ph) ||
>> +             memcmp(res + 1, res + 7, 16) ||
>> +             res[11] || res[12] || res[13] || res[14] )
>> +            goto fail;
>> +        printf("okay\n");
>> +    }
>> +    else
>> +        printf("skipped\n");
>> +
>>  #undef decl_insn
>>  #undef put_insn
>>  #undef set_insn
>> --- a/tools/tests/x86_emulator/x86_emulate.h
>> +++ b/tools/tests/x86_emulator/x86_emulate.h
>> @@ -127,6 +127,14 @@ static inline uint64_t xgetbv(uint32_t x
>>      (res.c & (1U << 28)) != 0; \
>>  })
>>
>> +#define cpu_has_f16c ({ \
>> +    struct cpuid_leaf res; \
>> +    emul_test_cpuid(1, 0, &res, NULL); \
>> +    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
>> +        res.c = 0; \
>> +    (res.c & (1U << 29)) != 0; \
>> +})
>> +
>>  #define cpu_has_avx2 ({ \
>>      struct cpuid_leaf res; \
>>      emul_test_cpuid(1, 0, &res, NULL); \
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -369,6 +369,7 @@ static const struct {
>>      [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
>>      [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
>>      [0x10] = { .simd_size = simd_packed_int },
>> +    [0x13] = { .simd_size = simd_other, .two_op = 1 },
>>      [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
>>      [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
>>      [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
>> @@ -411,6 +412,7 @@ static const struct {
>>      [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
>>      [0x18] = { .simd_size = simd_128 },
>>      [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
>> +    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
>>      [0x20] = { .simd_size = simd_none },
>>      [0x21] = { .simd_size = simd_other },
>>      [0x22] = { .simd_size = simd_none },
>> @@ -1601,6 +1603,7 @@ static bool vcpu_has(
>>  #define vcpu_has_popcnt()      vcpu_has(         1, ECX, 23, ctxt, ops)
>>  #define vcpu_has_aesni()       vcpu_has(         1, ECX, 25, ctxt, ops)
>>  #define vcpu_has_avx()         vcpu_has(         1, ECX, 28, ctxt, ops)
>> +#define vcpu_has_f16c()        vcpu_has(         1, ECX, 29, ctxt, ops)
>>  #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
>>  #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
>>                                 vcpu_has_sse())
>> @@ -7216,6 +7219,12 @@ x86_emulate(
>>          host_and_vcpu_must_have(sse4_1);
>>          goto simd_0f38_common;
>>
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
>> +        generate_exception_if(vex.w, EXC_UD);
>> +        host_and_vcpu_must_have(f16c);
>> +        op_bytes = 8 << vex.l;
>> +        goto simd_0f_ymm;
>> +
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
>> @@ -7607,6 +7616,50 @@ x86_emulate(
>>          opc = init_prefixes(stub);
>>          goto pextr;
>>
>> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
>
> On the whole this stanza looks plausible; just a few comments...
>
>> +    {
>> +        uint32_t mxcsr;
>> +
>> +        generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
>> +        host_and_vcpu_must_have(f16c);
>> +        fail_if(!ops->write);
>> +
>> +        opc = init_prefixes(stub);
>> +        opc[0] = b;
>> +        opc[1] = modrm;
>> +        if ( ea.type == OP_MEM )
>> +        {
>> +            /* Convert memory operand to (%rAX). */
>> +            vex.b = 1;
>> +            opc[1] &= 0x38;
>> +        }
>
> First of all, I'm not a fan of modifying the actual decoded vex prefix
> here.  I realize that as it happens vex.b won't be read again; but it
> just seems like risky behavior.  Would it make more sense to make a
> copy of vex and modify that?
>
> Secondly, the logic here is confusing: you first copy the decoded
> modrm byte, then modify one bit of the decoded vex prefix, then modify
> the copied modrm byte, then (below) copy the (potentially) modified
> vex prefix.
>
> It seems like it would be a lot more clear to have the if before the
> init_prefixes, and have it look like this:
>
> if ( ea.type == OP_MEM )
> {
>   /* Convert blah */
>   vex.b = 1;
>   modrm &= 0x38;
> }

I see now -- you're using a construct that is common all over the code.

I think the construct could probably use changing, but currently for
readability it's probably better to follow suit.

 -George

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 04/17] x86emul: support F16C insns
  2017-09-14  9:13     ` George Dunlap
@ 2017-09-14 10:24       ` Jan Beulich
  0 siblings, 0 replies; 27+ messages in thread
From: Jan Beulich @ 2017-09-14 10:24 UTC (permalink / raw)
  To: George Dunlap; +Cc: Andrew Cooper, xen-devel

>>> On 14.09.17 at 11:13, <george.dunlap@citrix.com> wrote:
> I see now -- you're using a construct that is common all over the code.
> 
> I think the construct could probably use changing, but currently for
> readability it's probably better to follow suit.

Perhaps, though at the time I couldn't think of anything that would
look better, work, and not result in much larger code size.

Jan


^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2017-09-14 10:24 UTC | newest]

Thread overview: 27+ messages
2017-06-21 11:23 [PATCH 00/17] x86: emulator enhancements Jan Beulich
2017-06-21 11:59 ` [PATCH 01/17] x86emul: support remaining AVX insns Jan Beulich
2017-09-13 15:02   ` George Dunlap
2017-09-13 15:31     ` Jan Beulich
2017-06-21 11:59 ` [PATCH 02/17] x86emul: re-order cases of main switch statement Jan Beulich
2017-09-13 15:15   ` George Dunlap
2017-06-21 12:00 ` [PATCH 03/17] x86emul: build SIMD tests with -Os Jan Beulich
2017-09-13 15:19   ` George Dunlap
2017-09-13 15:34     ` Jan Beulich
2017-06-21 12:01 ` [PATCH 04/17] x86emul: support F16C insns Jan Beulich
2017-09-13 17:10   ` George Dunlap
2017-09-14  9:13     ` George Dunlap
2017-09-14 10:24       ` Jan Beulich
2017-06-21 12:01 ` [PATCH 05/17] x86emul: support FMA4 insns Jan Beulich
2017-06-21 12:02 ` [PATCH 06/17] x86emul: support FMA insns Jan Beulich
2017-06-21 12:02 ` [PATCH 07/17] x86emul: support most remaining AVX2 insns Jan Beulich
2017-06-21 12:03 ` [PATCH 08/17] x86emul: fold/eliminate some local variables Jan Beulich
2017-06-21 12:04 ` [PATCH 09/17] x86emul: support AVX2 gather insns Jan Beulich
2017-06-21 12:04 ` [PATCH 10/17] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
2017-06-21 12:05 ` [PATCH 11/17] x86emul: support XOP insns Jan Beulich
2017-06-21 12:05 ` [PATCH 12/17] x86emul: support 3DNow! insns Jan Beulich
2017-06-21 12:06 ` [PATCH 13/17] x86emul: re-order checks in test harness Jan Beulich
2017-06-21 12:07 ` [PATCH 14/17] x86emul: abstract out XCRn accesses Jan Beulich
2017-06-21 12:07 ` [PATCH 15/17] x86emul: adjust_bnd() should check XCR0 Jan Beulich
2017-06-21 12:08 ` [PATCH 16/17] x86emul: make all FPU emulation use the stub Jan Beulich
2017-06-21 12:09 ` [PATCH 17/17] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
2017-09-05 17:08 ` [PATCH 00/17] x86: emulator enhancements George Dunlap
