* [PATCH v3 00/25] x86: emulator enhancements
@ 2017-12-07 13:49 Jan Beulich
  2017-12-07 13:58 ` [PATCH v3 01/25] x86emul: make decode_register() return unsigned long * Jan Beulich
                   ` (24 more replies)
  0 siblings, 25 replies; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 13:49 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

01: make decode_register() return unsigned long *
02: build SIMD tests with -Os
03: support F16C insns
04: support FMA4 insns
05: support FMA insns
06: support most remaining AVX2 insns
07: support AVX2 gather insns
08: add tables for XOP 08 and 09 extension spaces
09: support XOP insns
10: support 3DNow! insns
11: place test blobs in executable section
12: abstract out XCRn accesses
13: adjust_bnd() should check XCR0
14: make all FPU emulation use the stub
15: eliminate custom #MF/#XM handling
16: support SWAPGS
17: emulate {MONITOR,MWAIT}{,X} as no-op
18: add missing suffixes in test harness
19: tell cmpxchg hook whether LOCK is in effect
20: correctly handle CMPXCHG* comparison failures
21: add read-modify-write hook
22: x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
23: x86/HVM: make use of new read-modify-write emulator hook
24: x86/shadow: fully move unmap-dest into common code
25: x86/shadow: fold sh_x86_emulate_{write,cmpxchg}() into their only callers

Signed-off-by: Jan Beulich <jbeulich@suse.com>

v3: Several new patches; existing ones mostly unchanged (except
    for re-basing).





* [PATCH v3 01/25] x86emul: make decode_register() return unsigned long *
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
@ 2017-12-07 13:58 ` Jan Beulich
  2017-12-07 18:32   ` Andrew Cooper
  2017-12-07 13:59 ` [PATCH v3 02/25] x86emul: build SIMD tests with -Os Jan Beulich
                   ` (23 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 13:58 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Kevin Tian, Jun Nakajima

Quite a few casts can be dropped this way, and type safety is increased
by not using void * (the same goes for decode_vex_gpr()). Drop casts and
no-longer-needed intermediate variables where possible. Take the
opportunity to also switch the last parameter to bool.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
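
(Not part of the patch - a minimal illustrative sketch, using hypothetical
stand-in helpers, of why the unsigned long * return type helps: the old
void * interface needs an unchecked cast at every call site, whereas the
new one lets the compiler verify the pointer type.)

#include <stdio.h>

static unsigned long regs[16];

/* hypothetical stand-ins for the two prototypes being compared */
static void *decode_register_old(unsigned int idx) { return &regs[idx]; }
static unsigned long *decode_register_new(unsigned int idx) { return &regs[idx]; }

int main(void)
{
    regs[3] = 0x1234;

    long v1 = *(long *)decode_register_old(3);  /* cast required, not type checked */
    unsigned long v2 = *decode_register_new(3); /* no cast, compiler checked */

    printf("%lx %lx\n", v1, v2);
    return 0;
}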

--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -347,18 +347,14 @@ enum vmx_insn_errno set_vvmcs_real_safe(
 static unsigned long reg_read(struct cpu_user_regs *regs,
                               enum vmx_regs_enc index)
 {
-    unsigned long *pval = decode_register(index, regs, 0);
-
-    return *pval;
+    return *decode_register(index, regs, false);
 }
 
 static void reg_write(struct cpu_user_regs *regs,
                       enum vmx_regs_enc index,
                       unsigned long value)
 {
-    unsigned long *pval = decode_register(index, regs, 0);
-
-    *pval = value;
+    *decode_register(index, regs, false) = value;
 }
 
 static inline u32 __n2_pin_exec_control(struct vcpu *v)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1935,9 +1935,9 @@ load_seg(
     return rc;
 }
 
-void *
+unsigned long *
 decode_register(
-    uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs)
+    uint8_t modrm_reg, struct cpu_user_regs *regs, bool highbyte_regs)
 {
     void *p;
 
@@ -1967,10 +1967,11 @@ decode_register(
     return p;
 }
 
-static void *decode_vex_gpr(unsigned int vex_reg, struct cpu_user_regs *regs,
-                            const struct x86_emulate_ctxt *ctxt)
+static unsigned long *decode_vex_gpr(unsigned int vex_reg,
+                                     struct cpu_user_regs *regs,
+                                     const struct x86_emulate_ctxt *ctxt)
 {
-    return decode_register(~vex_reg & (mode_64bit() ? 0xf : 7), regs, 0);
+    return decode_register(~vex_reg & (mode_64bit() ? 0xf : 7), regs, false);
 }
 
 static bool is_aligned(enum x86_segment seg, unsigned long offs,
@@ -2779,8 +2780,8 @@ x86_decode(
                 sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
                 sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
                 if ( sib_index != 4 && !(d & vSIB) )
-                    ea.mem.off = *(long *)decode_register(sib_index,
-                                                          state->regs, 0);
+                    ea.mem.off = *decode_register(sib_index, state->regs,
+                                                  false);
                 ea.mem.off <<= (sib >> 6) & 3;
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
@@ -2799,15 +2800,14 @@ x86_decode(
                     ea.mem.off += state->regs->r(bp);
                 }
                 else
-                    ea.mem.off += *(long *)decode_register(sib_base,
-                                                           state->regs, 0);
+                    ea.mem.off += *decode_register(sib_base, state->regs,
+                                                   false);
             }
             else
             {
                 generate_exception_if(d & vSIB, EXC_UD);
                 modrm_rm |= (rex_prefix & 1) << 3;
-                ea.mem.off = *(long *)decode_register(modrm_rm,
-                                                      state->regs, 0);
+                ea.mem.off = *decode_register(modrm_rm, state->regs, false);
                 if ( (modrm_rm == 5) && (modrm_mod != 0) )
                     ea.mem.seg = x86_seg_ss;
             }
@@ -3329,8 +3329,8 @@ x86_emulate(
         break;
 
     case 0x50 ... 0x57: /* push reg */
-        src.val = *(unsigned long *)decode_register(
-            (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0);
+        src.val = *decode_register((b & 7) | ((rex_prefix & 1) << 3),
+                                   &_regs, false);
         goto push;
 
     case 0x58 ... 0x5f: /* pop reg */
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -606,9 +606,9 @@ int x86_emulate_wrapper(
  * pointer into the block that addresses the relevant register.
  * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
  */
-void *
+unsigned long *
 decode_register(
-    uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs);
+    uint8_t modrm_reg, struct cpu_user_regs *regs, bool highbyte_regs);
 
 /* Unhandleable read, write or instruction fetch */
 int




* [PATCH v3 02/25] x86emul: build SIMD tests with -Os
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
  2017-12-07 13:58 ` [PATCH v3 01/25] x86emul: make decode_register() return unsigned long * Jan Beulich
@ 2017-12-07 13:59 ` Jan Beulich
  2017-12-07 18:32   ` Andrew Cooper
  2017-12-07 14:00 ` [PATCH v3 03/25] x86emul: support F16C insns Jan Beulich
                   ` (22 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 13:59 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

While putting together subsequent patches I've noticed that, in
combination with the touch() macro, building with -Os further increases
the chances of the compiler using memory operands for the instructions
we actually care to test.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
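
(Sketch only, not part of the patch: this mirrors the touch() macro the
test harness uses - see simd.h later in this series. The empty asm with a
"+m" constraint forces the variable to live in memory at that point, so,
particularly at -Os, the following use tends to be encoded with a memory
operand rather than a register.)

#define touch(var) asm volatile ( "" : "+m" (var) )

int scale(int x)
{
    touch(x);      /* x is now considered live in memory */
    return x * 3;  /* at -Os this is likely an imul with a memory source operand */
}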

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -45,17 +45,17 @@ define simd-defs
 $(1)-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+	    "-D_$(vec)i$(int) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
 	  $(foreach flt,$($(1)-flts), \
-	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
-	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)")
+	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
 $(1)-avx-cflags := \
 	$(foreach vec,$($(1)-vecs), \
 	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
+	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))





* [PATCH v3 03/25] x86emul: support F16C insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
  2017-12-07 13:58 ` [PATCH v3 01/25] x86emul: make decode_register() return unsigned long * Jan Beulich
  2017-12-07 13:59 ` [PATCH v3 02/25] x86emul: build SIMD tests with -Os Jan Beulich
@ 2017-12-07 14:00 ` Jan Beulich
  2018-01-31 18:58   ` Andrew Cooper
  2017-12-07 14:01 ` [PATCH v3 04/25] x86emul: support FMA4 insns Jan Beulich
                   ` (21 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:00 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Note that this avoids emulating the behavior of VCVTPS2PH found on at
least some Intel CPUs, which update MXCSR even when the memory write
faults.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
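
(For reference, not part of the patch: the half-precision test constants
used below - e.g. 0x3c00 for 1.0, 0x4000 for 2.0, 0xb800 for -0.5 - follow
the IEEE binary16 layout. A minimal sketch deriving them for normal values,
ignoring denormals, infinities and NaNs:)

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t to_half(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof(bits));
    return ((bits >> 16) & 0x8000) |                /* sign */
           ((((bits >> 23) & 0xff) - 112) << 10) |  /* re-bias exponent 127 -> 15 */
           ((bits >> 13) & 0x3ff);                  /* top 10 mantissa bits */
}

int main(void)
{
    printf("%#x %#x %#x\n", to_half(1.0f), to_half(2.0f), to_half(-0.5f));
    /* prints 0x3c00 0x4000 0xb800, matching the vcvtph2ps test inputs */
    return 0;
}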

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3053,6 +3053,47 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
+    printf("%-40s", "Testing vcvtph2ps (%ecx),%ymm1...");
+    if ( stack_exec && cpu_has_f16c )
+    {
+        decl_insn(vcvtph2ps);
+        decl_insn(vcvtps2ph);
+
+        asm volatile ( "vxorps %%xmm1, %%xmm1, %%xmm1\n"
+                       put_insn(vcvtph2ps, "vcvtph2ps (%0), %%ymm1")
+                       :: "c" (NULL) );
+
+        set_insn(vcvtph2ps);
+        res[1] = 0x40003c00; /* (1.0, 2.0) */
+        res[2] = 0x44004200; /* (3.0, 4.0) */
+        res[3] = 0x3400b800; /* (-.5, .25) */
+        res[4] = 0xbc000000; /* (0.0, -1.) */
+        memset(res + 5, 0xff, 16);
+        regs.ecx = (unsigned long)(res + 1);
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "vmovups %%ymm1, %0" : "=m" (res[16]) );
+        if ( rc != X86EMUL_OKAY || !check_eip(vcvtph2ps) )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vcvtps2ph $0,%ymm1,(%edx)...");
+        asm volatile ( "vmovups %0, %%ymm1\n"
+                       put_insn(vcvtps2ph, "vcvtps2ph $0, %%ymm1, (%1)")
+                       :: "m" (res[16]), "d" (NULL) );
+
+        set_insn(vcvtps2ph);
+        memset(res + 7, 0, 32);
+        regs.edx = (unsigned long)(res + 7);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vcvtps2ph) ||
+             memcmp(res + 1, res + 7, 16) ||
+             res[11] || res[12] || res[13] || res[14] )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -132,6 +132,14 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 28)) != 0; \
 })
 
+#define cpu_has_f16c ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    (res.c & (1U << 29)) != 0; \
+})
+
 #define cpu_has_avx2 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -369,6 +369,7 @@ static const struct {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
+    [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
@@ -411,6 +412,7 @@ static const struct {
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
     [0x18] = { .simd_size = simd_128 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -1602,6 +1604,7 @@ static bool vcpu_has(
 #define vcpu_has_popcnt()      vcpu_has(         1, ECX, 23, ctxt, ops)
 #define vcpu_has_aesni()       vcpu_has(         1, ECX, 25, ctxt, ops)
 #define vcpu_has_avx()         vcpu_has(         1, ECX, 28, ctxt, ops)
+#define vcpu_has_f16c()        vcpu_has(         1, ECX, 29, ctxt, ops)
 #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
 #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
@@ -7230,6 +7233,12 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_1);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        host_and_vcpu_must_have(f16c);
+        op_bytes = 8 << vex.l;
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -7621,6 +7630,50 @@ x86_emulate(
         opc = init_prefixes(stub);
         goto pextr;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
+    {
+        uint32_t mxcsr;
+
+        generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
+        host_and_vcpu_must_have(f16c);
+        fail_if(!ops->write);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* Convert memory operand to (%rAX). */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        opc[2] = imm1;
+        fic.insn_bytes = PFX_BYTES + 3;
+        opc[3] = 0xc3;
+
+        copy_VEX(opc, vex);
+        /* Latch MXCSR - we may need to restore it below. */
+        invoke_stub("stmxcsr %[mxcsr]", "",
+                    "=m" (*mmvalp), "+m" (fic.exn_raised), [mxcsr] "=m" (mxcsr)
+                    : "a" (mmvalp));
+
+        put_stub(stub);
+        check_xmm_exn(&fic);
+
+        if ( ea.type == OP_MEM )
+        {
+            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );
+                goto done;
+            }
+        }
+
+        state->simd_size = simd_none;
+        break;
+    }
+
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -60,6 +60,7 @@
 #define cpu_has_aesni           boot_cpu_has(X86_FEATURE_AESNI)
 #define cpu_has_xsave           boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_avx             boot_cpu_has(X86_FEATURE_AVX)
+#define cpu_has_f16c            boot_cpu_has(X86_FEATURE_F16C)
 #define cpu_has_rdrand          boot_cpu_has(X86_FEATURE_RDRAND)
 #define cpu_has_hypervisor      boot_cpu_has(X86_FEATURE_HYPERVISOR)
 




* [PATCH v3 04/25] x86emul: support FMA4 insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (2 preceding siblings ...)
  2017-12-07 14:00 ` [PATCH v3 03/25] x86emul: support F16C insns Jan Beulich
@ 2017-12-07 14:01 ` Jan Beulich
  2018-01-31 19:51   ` Andrew Cooper
  2017-12-07 14:02 ` [PATCH v3 05/25] x86emul: support FMA insns Jan Beulich
                   ` (20 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:01 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Re-base.

--- a/.gitignore
+++ b/.gitignore
@@ -226,6 +226,7 @@
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
+tools/tests/x86_emulator/fma*.[ch]
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -12,7 +12,8 @@ run: $(TARGET)
 	./$(TARGET)
 
 SIMD := sse sse2 sse4 avx
-TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx
+FMA := fma4
+TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -29,6 +30,9 @@ sse4-flts := $(sse2-flts)
 avx-vecs := 16 32
 avx-ints :=
 avx-flts := 4 8
+fma4-vecs := $(avx-vecs)
+fma4-ints :=
+fma4-flts := $(avx-flts)
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
@@ -58,7 +62,7 @@ $(1)-avx-cflags := \
 	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
-$(foreach flavor,$(SIMD),$(eval $(call simd-defs,$(flavor))))
+$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -77,6 +81,11 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 $(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))):
 	ln -sf simd.c $@
 
+$(addsuffix .c,$(FMA)):
+	ln -sf simd-fma.c $@
+
+$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h
+
 $(TARGET): x86-emulate.o test_x86_emulator.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -1,71 +1,6 @@
-#include <stdbool.h>
+#include "simd.h"
 
-asm (
-    "\t.text\n"
-    "\t.globl _start\n"
-    "_start:\n"
-#if defined(__i386__) && VEC_SIZE == 16
-    "\tpush %ebp\n"
-    "\tmov %esp,%ebp\n"
-    "\tand $~0xf,%esp\n"
-    "\tcall simd_test\n"
-    "\tleave\n"
-    "\tret"
-#else
-    "\tjmp simd_test"
-#endif
-    );
-
-typedef
-#if defined(INT_SIZE)
-# define ELEM_SIZE INT_SIZE
-signed int
-# if INT_SIZE == 1
-#  define MODE QI
-# elif INT_SIZE == 2
-#  define MODE HI
-# elif INT_SIZE == 4
-#  define MODE SI
-# elif INT_SIZE == 8
-#  define MODE DI
-# endif
-#elif defined(UINT_SIZE)
-# define ELEM_SIZE UINT_SIZE
-unsigned int
-# if UINT_SIZE == 1
-#  define MODE QI
-# elif UINT_SIZE == 2
-#  define MODE HI
-# elif UINT_SIZE == 4
-#  define MODE SI
-# elif UINT_SIZE == 8
-#  define MODE DI
-# endif
-#elif defined(FLOAT_SIZE)
-float
-# define ELEM_SIZE FLOAT_SIZE
-# if FLOAT_SIZE == 4
-#  define MODE SF
-# elif FLOAT_SIZE == 8
-#  define MODE DF
-# endif
-#endif
-#ifndef VEC_SIZE
-# define VEC_SIZE ELEM_SIZE
-#endif
-__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
-
-#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
-
-typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
-
-/* Various builtins want plain char / int / long long vector types ... */
-typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
-typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
-typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
-#if VEC_SIZE >= 8
-typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
-#endif
+ENTRY(simd_test);
 
 #if VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
@@ -418,13 +353,6 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
-/*
- * Suppress value propagation by the compiler, preventing unwanted
- * optimization. This at once makes the compiler use memory operands
- * more often, which for our purposes is the more interesting case.
- */
-#define touch(var) asm volatile ( "" : "+m" (var) )
-
 int simd_test(void)
 {
     unsigned int i, j;
--- /dev/null
+++ b/tools/tests/x86_emulator/simd.h
@@ -0,0 +1,78 @@
+#include <stdbool.h>
+
+#if defined(__i386__) && VEC_SIZE == 16
+# define ENTRY(name) \
+asm ( "\t.text\n" \
+      "\t.globl _start\n" \
+      "_start:\n" \
+      "\tpush %ebp\n" \
+      "\tmov %esp,%ebp\n" \
+      "\tand $~0xf,%esp\n" \
+      "\tcall " #name "\n" \
+      "\tleave\n" \
+      "\tret" )
+#else
+# define ENTRY(name) \
+asm ( "\t.text\n" \
+      "\t.globl _start\n" \
+      "_start:\n" \
+      "\tjmp " #name )
+#endif
+
+typedef
+#if defined(INT_SIZE)
+# define ELEM_SIZE INT_SIZE
+signed int
+# if INT_SIZE == 1
+#  define MODE QI
+# elif INT_SIZE == 2
+#  define MODE HI
+# elif INT_SIZE == 4
+#  define MODE SI
+# elif INT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(UINT_SIZE)
+# define ELEM_SIZE UINT_SIZE
+unsigned int
+# if UINT_SIZE == 1
+#  define MODE QI
+# elif UINT_SIZE == 2
+#  define MODE HI
+# elif UINT_SIZE == 4
+#  define MODE SI
+# elif UINT_SIZE == 8
+#  define MODE DI
+# endif
+#elif defined(FLOAT_SIZE)
+float
+# define ELEM_SIZE FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define MODE SF
+# elif FLOAT_SIZE == 8
+#  define MODE DF
+# endif
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE ELEM_SIZE
+#endif
+__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
+
+#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
+
+typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
+
+/* Various builtins want plain char / int / long long vector types ... */
+typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
+typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+#if VEC_SIZE >= 8
+typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+#endif
+
+/*
+ * Suppress value propagation by the compiler, preventing unwanted
+ * optimization. This at once makes the compiler use memory operands
+ * more often, which for our purposes is the more interesting case.
+ */
+#define touch(var) asm volatile ( "" : "+m" (var) )
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -0,0 +1,121 @@
+#include "simd.h"
+
+ENTRY(fma_test);
+
+#if VEC_SIZE < 16
+# define to_bool(cmp) (!~(cmp)[0])
+#elif VEC_SIZE == 16
+# if FLOAT_SIZE == 4
+#  define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
+# elif FLOAT_SIZE == 8
+#  define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
+# endif
+#elif VEC_SIZE == 32
+# if FLOAT_SIZE == 4
+#  define to_bool(cmp) __builtin_ia32_vtestcps256(cmp, (vec_t){} == 0)
+# elif FLOAT_SIZE == 8
+#  define to_bool(cmp) __builtin_ia32_vtestcpd256(cmp, (vec_t){} == 0)
+# endif
+#endif
+
+#if VEC_SIZE == 16
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z)
+#  endif
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z)
+#  endif
+# endif
+#elif VEC_SIZE == 32
+# if FLOAT_SIZE == 4
+#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z)
+#  endif
+# elif FLOAT_SIZE == 8
+#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
+#  if defined(__FMA4__)
+#   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z)
+#  endif
+# endif
+#endif
+
+int fma_test(void)
+{
+    unsigned int i;
+    vec_t x, y, z, src, inv, one;
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i + 1;
+        inv[i] = ELEM_COUNT - i;
+        one[i] = 1;
+    }
+
+    x = (src + one) * inv;
+    y = (src - one) * inv;
+    touch(src);
+    z = inv * src + inv;
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(src);
+    z = -inv * src - inv;
+    if ( !to_bool(-x == z) ) return __LINE__;
+
+    touch(src);
+    z = inv * src - inv;
+    if ( !to_bool(y == z) ) return __LINE__;
+
+    touch(src);
+    z = -inv * src + inv;
+    if ( !to_bool(-y == z) ) return __LINE__;
+    touch(src);
+
+    x = src + inv;
+    y = src - inv;
+    touch(inv);
+    z = src * one + inv;
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(inv);
+    z = -src * one - inv;
+    if ( !to_bool(-x == z) ) return __LINE__;
+
+    touch(inv);
+    z = src * one - inv;
+    if ( !to_bool(y == z) ) return __LINE__;
+
+    touch(inv);
+    z = -src * one + inv;
+    if ( !to_bool(-y == z) ) return __LINE__;
+    touch(inv);
+
+#if defined(addsub) && defined(fmaddsub)
+    x = addsub(src * inv, one);
+    y = addsub(src * inv, -one);
+    touch(one);
+    z = fmaddsub(src, inv, one);
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(one);
+    z = fmaddsub(src, inv, -one);
+    if ( !to_bool(y == z) ) return __LINE__;
+    touch(one);
+
+    x = addsub(src * inv, one);
+    touch(inv);
+    z = fmaddsub(src, inv, one);
+    if ( !to_bool(x == z) ) return __LINE__;
+
+    touch(inv);
+    z = fmaddsub(src, inv, -one);
+    if ( !to_bool(y == z) ) return __LINE__;
+    touch(inv);
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,6 +11,7 @@
 #include "sse2-avx.h"
 #include "sse4-avx.h"
 #include "avx.h"
+#include "fma4.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -47,6 +48,11 @@ static bool simd_check_avx(void)
 #define simd_check_sse2_avx  simd_check_avx
 #define simd_check_sse4_avx  simd_check_avx
 
+static bool simd_check_fma4(void)
+{
+    return cpu_has_fma4;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -143,6 +149,12 @@ static const struct {
     SIMD(AVX scalar double,      avx,         f8),
     SIMD(AVX 128bit double,      avx,       16f8),
     SIMD(AVX 256bit double,      avx,       32f8),
+    SIMD(FMA4 scalar single,     fma4,        f4),
+    SIMD(FMA4 128bit single,     fma4,      16f4),
+    SIMD(FMA4 256bit single,     fma4,      32f4),
+    SIMD(FMA4 scalar double,     fma4,        f8),
+    SIMD(FMA4 128bit double,     fma4,      16f8),
+    SIMD(FMA4 256bit double,     fma4,      32f8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -169,6 +169,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 6)) != 0; \
 })
 
+#define cpu_has_fma4 ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.c & (1U << 16)) != 0; \
+})
+
 #define cpu_has_tbm ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(0x80000001, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -421,7 +421,16 @@ static const struct {
     [0x44] = { .simd_size = simd_packed_int },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x6a ... 0x6b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x6e ... 0x6f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x78 ... 0x79] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x7a ... 0x7b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x7e ... 0x7f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
     [0xcc] = { .simd_size = simd_other },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = {},
@@ -1613,6 +1622,7 @@ static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
 #define vcpu_has_hle()         vcpu_has(         7, EBX,  4, ctxt, ops)
@@ -6168,6 +6178,7 @@ x86_emulate(
     simd_0f_imm8_avx:
                 host_and_vcpu_must_have(avx);
             }
+    simd_0f_imm8_ymm:
             get_fpu(X86EMUL_FPU_ymm, &fic);
         }
         else if ( vex.pfx )
@@ -7726,6 +7737,49 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5c): /* vfmaddsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5d): /* vfmaddsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5e): /* vfmsubaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x5f): /* vfmsubaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x68): /* vfmaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x69): /* vfmaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6a): /* vfmaddss xmm,xmm/m32,xmm,xmm */
+                                           /* vfmaddss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6b): /* vfmaddsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfmaddsd xmm/m64,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6c): /* vfmsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6d): /* vfmsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfmsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6e): /* vfmsubss xmm,xmm/m32,xmm,xmm */
+                                           /* vfmsubss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x6f): /* vfmsubsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfmsubsd xmm/m64,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x78): /* vfnmaddps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmaddps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x79): /* vfnmaddpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7a): /* vfnmaddss xmm,xmm/m32,xmm,xmm */
+                                           /* vfnmaddss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7b): /* vfnmaddsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfnmaddsd xmm/m64,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7c): /* vfnmsubps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmsubps {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7d): /* vfnmsubpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+                                           /* vfnmsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7e): /* vfnmsubss xmm,xmm/m32,xmm,xmm */
+                                           /* vfnmsubss xmm/m32,xmm,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x7f): /* vfnmsubsd xmm,xmm/m64,xmm,xmm */
+                                           /* vfnmsubsd xmm/m64,xmm,xmm,xmm */
+        host_and_vcpu_must_have(fma4);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x60):     /* pcmpestrm $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x60): /* vpcmpestrm $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x61):     /* pcmpestri $imm8,xmm/m128,xmm */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
+#define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)
 
 /* CPUID level 0x0000000D:1.eax */




* [PATCH v3 05/25] x86emul: support FMA insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (3 preceding siblings ...)
  2017-12-07 14:01 ` [PATCH v3 04/25] x86emul: support FMA4 insns Jan Beulich
@ 2017-12-07 14:02 ` Jan Beulich
  2018-02-01 16:15   ` Andrew Cooper
  2017-12-07 14:03 ` [PATCH v3 06/25] x86emul: support most remaining AVX2 insns Jan Beulich
                   ` (19 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:02 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -12,7 +12,7 @@ run: $(TARGET)
 	./$(TARGET)
 
 SIMD := sse sse2 sse4 avx
-FMA := fma4
+FMA := fma4 fma
 TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
 
 blowfish-cflags := ""
@@ -33,6 +33,9 @@ avx-flts := 4 8
 fma4-vecs := $(avx-vecs)
 fma4-ints :=
 fma4-flts := $(avx-flts)
+fma-vecs := $(avx-vecs)
+fma-ints :=
+fma-flts := $(avx-flts)
 
 # When converting SSE to AVX, have the compiler avoid XMM0 to widen
 # coverage of the VEX.vvvv checks in the emulator. We must not do this,
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -21,24 +21,24 @@ ENTRY(fma_test);
 #if VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z)
 #  endif
 # elif FLOAT_SIZE == 8
 #  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z)
 #  endif
 # endif
 #elif VEC_SIZE == 32
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z)
 #  endif
 # elif FLOAT_SIZE == 8
 #  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
-#  if defined(__FMA4__)
+#  if defined(__FMA4__) || defined(__FMA__)
 #   define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z)
 #  endif
 # endif
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,6 +12,7 @@
 #include "sse4-avx.h"
 #include "avx.h"
 #include "fma4.h"
+#include "fma.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -53,6 +54,11 @@ static bool simd_check_fma4(void)
     return cpu_has_fma4;
 }
 
+static bool simd_check_fma(void)
+{
+    return cpu_has_fma;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -155,6 +161,12 @@ static const struct {
     SIMD(FMA4 scalar double,     fma4,        f8),
     SIMD(FMA4 128bit double,     fma4,      16f8),
     SIMD(FMA4 256bit double,     fma4,      32f8),
+    SIMD(FMA scalar single,      fma,         f4),
+    SIMD(FMA 128bit single,      fma,       16f4),
+    SIMD(FMA 256bit single,      fma,       32f4),
+    SIMD(FMA scalar double,      fma,         f8),
+    SIMD(FMA 128bit double,      fma,       16f8),
+    SIMD(FMA 256bit double,      fma,       32f8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -99,6 +99,14 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 0)) != 0; \
 })
 
+#define cpu_has_fma ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    (res.c & (1U << 12)) != 0; \
+})
+
 #define cpu_has_sse4_1 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -385,6 +385,9 @@ static const struct {
     [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
+    [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
+    [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
     [0xc8 ... 0xcd] = { .simd_size = simd_other },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -1606,6 +1609,7 @@ static bool vcpu_has(
 #define vcpu_has_sse3()        vcpu_has(         1, ECX,  0, ctxt, ops)
 #define vcpu_has_pclmulqdq()   vcpu_has(         1, ECX,  1, ctxt, ops)
 #define vcpu_has_ssse3()       vcpu_has(         1, ECX,  9, ctxt, ops)
+#define vcpu_has_fma()         vcpu_has(         1, ECX, 12, ctxt, ops)
 #define vcpu_has_cx16()        vcpu_has(         1, ECX, 13, ctxt, ops)
 #define vcpu_has_sse4_1()      vcpu_has(         1, ECX, 19, ctxt, ops)
 #define vcpu_has_sse4_2()      vcpu_has(         1, ECX, 20, ctxt, ops)
@@ -7366,6 +7370,39 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(fma);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -50,6 +50,7 @@
 #define cpu_has_vmx             boot_cpu_has(X86_FEATURE_VMX)
 #define cpu_has_eist            boot_cpu_has(X86_FEATURE_EIST)
 #define cpu_has_ssse3           boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_fma             boot_cpu_has(X86_FEATURE_FMA)
 #define cpu_has_cx16            boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_pdcm            boot_cpu_has(X86_FEATURE_PDCM)
 #define cpu_has_pcid            boot_cpu_has(X86_FEATURE_PCID)




* [PATCH v3 06/25] x86emul: support most remaining AVX2 insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (4 preceding siblings ...)
  2017-12-07 14:02 ` [PATCH v3 05/25] x86emul: support FMA insns Jan Beulich
@ 2017-12-07 14:03 ` Jan Beulich
  2018-02-01 19:45   ` Andrew Cooper
  2017-12-07 14:03 ` [PATCH v3 07/25] x86emul: support AVX2 gather insns Jan Beulich
                   ` (18 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:03 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

I.e. those which aren't equivalents of SSEn ones, with the exception of
the various gather operations.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: vbroadcasts{d,s} support register operands as of AVX2. Re-base.
v2: Add all vpmaskmov{d,q} handling here.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,9 +11,9 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx
+SIMD := sse sse2 sse4 avx avx2
 FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
+TESTCASES := blowfish $(SIMD) $(FMA)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -36,13 +36,9 @@ fma4-flts := $(avx-flts)
 fma-vecs := $(avx-vecs)
 fma-ints :=
 fma-flts := $(avx-flts)
-
-# When converting SSE to AVX, have the compiler avoid XMM0 to widen
-# coverage of the VEX.vvvv checks in the emulator. We must not do this,
-# however, for SSE4.1 and later, as there are instructions with XMM0 as
-# an implicit operand.
-sse2avx-sse2 := -ffixed-xmm0 -Wa,-msse2avx
-sse2avx-sse4 := -Wa,-msse2avx
+avx2-vecs := $(avx-vecs)
+avx2-ints := 1 2 4 8
+avx2-flts := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -58,11 +54,6 @@ $(1)-cflags := \
 	    "-D_$(vec)f$(flt) -m$(1) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
 	$(foreach flt,$($(1)-flts), \
 	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
-$(1)-avx-cflags := \
-	$(foreach vec,$($(1)-vecs), \
-	  $(foreach int,$($(1)-ints), \
-	    "-D_$(vec)i$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
-	    "-D_$(vec)u$(int) -m$(1) $(sse2avx-$(1)) -Os -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)"))
 endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
@@ -81,13 +72,13 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	)
 	mv $@.new $@
 
-$(addsuffix .c,$(SIMD)) $(addsuffix -avx.c,$(filter sse%,$(SIMD))):
+$(addsuffix .c,$(SIMD)):
 	ln -sf simd.c $@
 
 $(addsuffix .c,$(FMA)):
 	ln -sf simd-fma.c $@
 
-$(addsuffix .o,$(SIMD) $(FMA)) $(addsuffix -avx.o,$(filter sse%,$(SIMD))): simd.h
+$(addsuffix .o,$(SIMD) $(FMA)): simd.h
 
 $(TARGET): x86-emulate.o test_x86_emulator.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -23,7 +23,9 @@ ENTRY(simd_test);
 #  endif
 # endif
 #elif VEC_SIZE == 32
-# if defined(__AVX__) && ELEM_SIZE == 4
+# if defined(__AVX2__)
+#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vdi_t){} == 0)
+# elif defined(__AVX__) && ELEM_SIZE == 4
 #  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
 # elif defined(__AVX__) && ELEM_SIZE == 8
 #  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
@@ -70,7 +72,12 @@ static inline bool _to_bool(byte_vec_t b
 
 #if FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
-#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+#  if defined(__AVX2__)
+#   define broadcast(x) \
+    __builtin_ia32_vbroadcastss_ps256((float __attribute__((vector_size(16)))){ x })
+#  else
+#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
+#  endif
 #  define max(x, y) __builtin_ia32_maxps256(x, y)
 #  define min(x, y) __builtin_ia32_minps256(x, y)
 #  define recip(x) __builtin_ia32_rcpps256(x)
@@ -80,12 +87,18 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
     __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
 })
-#  define swap2(x) ({ \
-    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
-    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
+#  ifdef __AVX2__
+#   define swap2(x) __builtin_ia32_permvarsf256(x, __builtin_ia32_cvtps2dq256(inv) - 1)
+#  else
+#   define swap2(x) ({ \
+        vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
+        __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
 })
+#  endif
 # elif VEC_SIZE == 16
-#  ifdef __AVX__
+#  if defined(__AVX2__)
+#   define broadcast(x) __builtin_ia32_vbroadcastss_ps((vec_t){ x })
+#  elif defined(__AVX__)
 #   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
 #  endif
 #  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
@@ -106,7 +119,12 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #elif FLOAT_SIZE == 8 && defined(__SSE2__)
 # if VEC_SIZE == 32 && defined(__AVX__)
-#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+#  if defined(__AVX2__)
+#   define broadcast(x) \
+    __builtin_ia32_vbroadcastsd_pd256((double __attribute__((vector_size(16)))){ x })
+#  else
+#   define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
+#  endif
 #  define max(x, y) __builtin_ia32_maxpd256(x, y)
 #  define min(x, y) __builtin_ia32_minpd256(x, y)
 #  define recip(x) ({ \
@@ -128,6 +146,9 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
     __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
 })
+#  ifdef __AVX2__
+#   define swap2(x) __builtin_ia32_permdf256(x, 0b00011011)
+#  endif
 # elif VEC_SIZE == 16
 #  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
 #  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
@@ -184,6 +205,104 @@ static inline bool _to_bool(byte_vec_t b
     __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
     __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
 })
+#elif VEC_SIZE == 32 && defined(__AVX2__)
+# define swap_lanes(x, y, func, type) ({ \
+    long long __attribute__((vector_size(16))) t_ = __builtin_ia32_extract128i256((vdi_t)(y), 0); \
+    type t1_ = (type)__builtin_ia32_insert128i256((vdi_t)(x), t_, 1), t2_; \
+    t_ = __builtin_ia32_extract128i256((vdi_t)(x), 1); \
+    t2_ = (type)__builtin_ia32_insert128i256((vdi_t)(y), t_, 0); \
+    func(t1_, t2_); \
+})
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define broadcast(x) ({ char s_ = (x); vec_t d_; asm ( "vpbroadcastb %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb256((vqi_t)(x), (vqi_t)(y)))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 8))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define broadcast(x) ({ short s_ = (x); vec_t d_; asm ( "vpbroadcastw %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw256((vhi_t)(x), (vhi_t)(y)))
+#  define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddw256, vhi_t))
+#  define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubw256, vhi_t))
+#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw256((vhi_t)(x), (vhi_t)(y), 0b10101010))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 16))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+#  define broadcast(x) ({ int s_ = (x); vec_t d_; asm ( "vpbroadcastd %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd256((vsi_t)(x), (vsi_t)(y)))
+#  define hadd(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phaddd256, vsi_t))
+#  define hsub(x, y) ((vec_t)swap_lanes(x, y, __builtin_ia32_phsubd256, vsi_t))
+#  define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b10101010))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 32))
+#  define select(d, x, y, m) ({ \
+    vsi_t m_ = (vsi_t)(m); \
+    *(d) = (vec_t)__builtin_ia32_maskloadd256((vsi_t *)&(x),  m_); \
+    __builtin_ia32_maskstored256((vsi_t *)(d), ~m_, (vsi_t)(y)); \
+})
+#  define swap(x) ((vec_t)__builtin_ia32_permvarsi256((vsi_t)(x), (vsi_t)inv - 1))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define mix(x, y) ((vec_t)__builtin_ia32_pblendd256((vsi_t)(x), (vsi_t)(y), 0b11001100))
+#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr256(__builtin_ia32_permti256((vdi_t)(x), (vdi_t)(x), 0b00000001), \
+                                                       (vdi_t)(x), (n) * 64))
+#  define select(d, x, y, m) ({ \
+    vdi_t m_ = (vdi_t)(m); \
+    *(d) = (vec_t)__builtin_ia32_maskloadq256((vdi_t *)&(x),  m_); \
+    __builtin_ia32_maskstoreq256((vdi_t *)(d), ~m_, (vdi_t)(y)); \
+})
+#  define swap(x) ((vec_t)__builtin_ia32_permdi256((vdi_t)(x), 0b00011011))
+#  define swap2(x) ({ \
+    vdi_t t_ = __builtin_ia32_permdi256((vdi_t)(x), 0b10110001); \
+    (vec_t)__builtin_ia32_permti256(t_, t_, 0b00000001); \
+})
+# endif
+# if INT_SIZE == 1
+#  define abs(x) ((vec_t)__builtin_ia32_pabsb256((vqi_t)(x)))
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb256((vqi_t)(x), (vqi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminsb256((vqi_t)(x), (vqi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw256((vqi_t)(x)))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd256((vqi_t)(x)))
+#  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq256((vqi_t)(x)))
+# elif UINT_SIZE == 1
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub256((vqi_t)(x), (vqi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminub256((vqi_t)(x), (vqi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw256((vqi_t)(x)))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd256((vqi_t)(x)))
+#  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq256((vqi_t)(x)))
+# elif INT_SIZE == 2
+#  define abs(x) __builtin_ia32_pabsw256(x)
+#  define max(x, y) __builtin_ia32_pmaxsw256(x, y)
+#  define min(x, y) __builtin_ia32_pminsw256(x, y)
+#  define mul_hi(x, y) __builtin_ia32_pmulhw256(x, y)
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd256(x))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq256(x))
+# elif UINT_SIZE == 2
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw256((vhi_t)(x), (vhi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminuw256((vhi_t)(x), (vhi_t)(y)))
+#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw256((vhi_t)(x), (vhi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd256((vhi_t)(x)))
+#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq256((vhi_t)(x)))
+# elif INT_SIZE == 4
+#  define abs(x) __builtin_ia32_pabsd256(x)
+#  define max(x, y) __builtin_ia32_pmaxsd256(x, y)
+#  define min(x, y) __builtin_ia32_pminsd256(x, y)
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq256(x))
+# elif UINT_SIZE == 4
+#  define max(x, y) ((vec_t)__builtin_ia32_pmaxud256((vsi_t)(x), (vsi_t)(y)))
+#  define min(x, y) ((vec_t)__builtin_ia32_pminud256((vsi_t)(x), (vsi_t)(y)))
+#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq256((vsi_t)(x), (vsi_t)(y)))
+#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq256((vsi_t)(x)))
+# elif INT_SIZE == 8
+#  define broadcast(x) ({ \
+    long long s_ = (x); \
+    long long __attribute__((vector_size(16))) t_; \
+    vec_t d_; \
+    asm ( "vpbroadcastq %1,%0" : "=x" (t_) : "m" (s_)); \
+    asm ( "vbroadcasti128 %1,%0" : "=x" (d_) : "m" (t_)); \
+    d_; \
+})
+# elif UINT_SIZE == 8
+#  define broadcast(x) ({ long long s_ = (x); vec_t d_; asm ( "vpbroadcastq %1,%0" : "=x" (d_) : "m" (s_)); d_; })
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSE3__)
 # if FLOAT_SIZE == 4
@@ -207,25 +326,37 @@ static inline bool _to_bool(byte_vec_t b
 #  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
 #  define dup_hi(x) __builtin_ia32_movshdup256(x)
 #  define dup_lo(x) __builtin_ia32_movsldup256(x)
-#  define hadd(x, y) ({ \
+#  ifdef __AVX2__
+#   define hadd(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_haddps256(x, y), \
+                                                  (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+#   define hsub(x, y) __builtin_ia32_permvarsf256(__builtin_ia32_hsubps256(x, y), \
+                                                  (vsi_t){0, 1, 4, 5, 2, 3, 6, 7})
+#  else
+#   define hadd(x, y) ({ \
         vec_t t_ = __builtin_ia32_haddps256(x, y); \
         (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
 })
-#  define hsub(x, y) ({ \
+#   define hsub(x, y) ({ \
         vec_t t_ = __builtin_ia32_hsubps256(x, y); \
         (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
 })
+#  endif
 # elif FLOAT_SIZE == 8
 #  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
 #  define dup_lo(x) __builtin_ia32_movddup256(x)
-#  define hadd(x, y) ({ \
+#  ifdef __AVX2__
+#   define hadd(x, y) __builtin_ia32_permdf256(__builtin_ia32_haddpd256(x, y), 0b11011000)
+#   define hsub(x, y) __builtin_ia32_permdf256(__builtin_ia32_hsubpd256(x, y), 0b11011000)
+#  else
+#   define hadd(x, y) ({ \
         vec_t t_ = __builtin_ia32_haddpd256(x, y); \
         (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
 })
-#  define hsub(x, y) ({ \
+#   define hsub(x, y) ({ \
         vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
         (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
 })
+#  endif
 # endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__)
@@ -546,7 +677,7 @@ int simd_test(void)
     z *= alt;
 #  endif
     /*
-     * Zap elements for which the shift count is negative (and the hence the
+     * Zap elements for which the shift count is zero (and hence the
      * decrement below would yield a negative count.
      */
     z &= (sh > 0);
@@ -556,9 +687,14 @@ int simd_test(void)
     --sh;
     touch(sh);
     y = z << sh;
-    touch(sh);
     if ( !to_bool(x == y + y) ) return __LINE__;
 
+#  if defined(__AVX2__) && ELEM_SIZE >= 4
+    touch(sh);
+    x = y >> sh;
+    if ( !to_bool(x == z) ) return __LINE__;
+#  endif
+
 # endif
 
 #endif
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -8,11 +8,10 @@
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
-#include "sse2-avx.h"
-#include "sse4-avx.h"
 #include "avx.h"
 #include "fma4.h"
 #include "fma.h"
+#include "avx2.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -46,8 +45,6 @@ static bool simd_check_avx(void)
 {
     return cpu_has_avx;
 }
-#define simd_check_sse2_avx  simd_check_avx
-#define simd_check_sse4_avx  simd_check_avx
 
 static bool simd_check_fma4(void)
 {
@@ -59,6 +56,11 @@ static bool simd_check_fma(void)
     return cpu_has_fma;
 }
 
+static bool simd_check_avx2(void)
+{
+    return cpu_has_avx2;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -133,22 +135,6 @@ static const struct {
     SIMD(SSE4 packed u32,        sse4,      16u4),
     SIMD(SSE4 packed s64,        sse4,      16i8),
     SIMD(SSE4 packed u64,        sse4,      16u8),
-    SIMD(SSE2/AVX packed s8,     sse2_avx,  16i1),
-    SIMD(SSE2/AVX packed u8,     sse2_avx,  16u1),
-    SIMD(SSE2/AVX packed s16,    sse2_avx,  16i2),
-    SIMD(SSE2/AVX packed u16,    sse2_avx,  16u2),
-    SIMD(SSE2/AVX packed s32,    sse2_avx,  16i4),
-    SIMD(SSE2/AVX packed u32,    sse2_avx,  16u4),
-    SIMD(SSE2/AVX packed s64,    sse2_avx,  16i8),
-    SIMD(SSE2/AVX packed u64,    sse2_avx,  16u8),
-    SIMD(SSE4/AVX packed s8,     sse4_avx,  16i1),
-    SIMD(SSE4/AVX packed u8,     sse4_avx,  16u1),
-    SIMD(SSE4/AVX packed s16,    sse4_avx,  16i2),
-    SIMD(SSE4/AVX packed u16,    sse4_avx,  16u2),
-    SIMD(SSE4/AVX packed s32,    sse4_avx,  16i4),
-    SIMD(SSE4/AVX packed u32,    sse4_avx,  16u4),
-    SIMD(SSE4/AVX packed s64,    sse4_avx,  16i8),
-    SIMD(SSE4/AVX packed u64,    sse4_avx,  16u8),
     SIMD(AVX scalar single,      avx,         f4),
     SIMD(AVX 128bit single,      avx,       16f4),
     SIMD(AVX 256bit single,      avx,       32f4),
@@ -167,6 +153,26 @@ static const struct {
     SIMD(FMA scalar double,      fma,         f8),
     SIMD(FMA 128bit double,      fma,       16f8),
     SIMD(FMA 256bit double,      fma,       32f8),
+    SIMD(AVX2 128bit single,     avx2,      16f4),
+    SIMD(AVX2 256bit single,     avx2,      32f4),
+    SIMD(AVX2 128bit double,     avx2,      16f8),
+    SIMD(AVX2 256bit double,     avx2,      32f8),
+    SIMD(AVX2 s8x16,             avx2,      16i1),
+    SIMD(AVX2 u8x16,             avx2,      16u1),
+    SIMD(AVX2 s16x8,             avx2,      16i2),
+    SIMD(AVX2 u16x8,             avx2,      16u2),
+    SIMD(AVX2 s32x4,             avx2,      16i4),
+    SIMD(AVX2 u32x4,             avx2,      16u4),
+    SIMD(AVX2 s64x2,             avx2,      16i8),
+    SIMD(AVX2 u64x2,             avx2,      16u8),
+    SIMD(AVX2 s8x32,             avx2,      32i1),
+    SIMD(AVX2 u8x32,             avx2,      32u1),
+    SIMD(AVX2 s16x16,            avx2,      32i2),
+    SIMD(AVX2 u16x16,            avx2,      32u2),
+    SIMD(AVX2 s32x8,             avx2,      32i4),
+    SIMD(AVX2 u32x8,             avx2,      32u4),
+    SIMD(AVX2 s64x4,             avx2,      32i8),
+    SIMD(AVX2 u64x4,             avx2,      32u8),
 #undef SIMD_
 #undef SIMD
 };
@@ -2950,6 +2956,91 @@ int main(int argc, char **argv)
              res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
             goto fail;
 
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vpmaskmovd %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx2 )
+    {
+        decl_insn(vpmaskmovd);
+
+        asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+                       put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)")
+#else
+                       put_insn(vpmaskmovd,
+                                ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a")
+#endif
+                       :: "d" (NULL), "r" (~0) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vpmaskmovd);
+        regs.edx = (unsigned long)res + MMAP_SZ - 4;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 12) )
+            goto fail;
+
+        asm volatile ( "vpinsrd $0b11, %0, %%xmm1, %%xmm2" :: "r" (~0) );
+        memset(res, 0xdb, 32);
+        set_insn(vpmaskmovd);
+        regs.edx = (unsigned long)(res - 3);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovd) ||
+             res[0] || memcmp(res + 1, res + 4, 12) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vpmaskmovq %xmm1,%xmm2,(%edx)...");
+    if ( stack_exec && cpu_has_avx2 )
+    {
+        decl_insn(vpmaskmovq);
+
+        asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
+                       "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t"
+#if 0 /* Don't use AVX2 instructions for now */
+                       "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t"
+                       put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)")
+#else
+                       ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t"
+                       put_insn(vpmaskmovq,
+                                ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a")
+#endif
+                       :: "d" (NULL) );
+
+        memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
+        set_insn(vpmaskmovq);
+        regs.edx = (unsigned long)res + MMAP_SZ - 8;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+             res[MMAP_SZ / sizeof(*res) - 1] ||
+             res[MMAP_SZ / sizeof(*res) - 2] ||
+             memcmp(res + MMAP_SZ / sizeof(*res) - 8,
+                    res + MMAP_SZ / sizeof(*res) - 4, 8) )
+            goto fail;
+
+#if 0 /* Don't use AVX2 instructions for now */
+        asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" );
+#else
+        asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" );
+#endif
+        memset(res, 0xdb, 32);
+        set_insn(vpmaskmovq);
+        regs.edx = (unsigned long)(res - 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vpmaskmovq) ||
+             res[0] || res[1] || memcmp(res + 2, res + 4, 8) )
+            goto fail;
+
         printf("okay\n");
     }
     else
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -370,7 +370,7 @@ static const struct {
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10] = { .simd_size = simd_packed_int },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
-    [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+    [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
     [0x1a] = { .simd_size = simd_128, .two_op = 1 },
@@ -382,9 +382,15 @@ static const struct {
     [0x2c ... 0x2d] = { .simd_size = simd_other },
     [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
-    [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
+    [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x45 ... 0x47] = { .simd_size = simd_packed_int },
+    [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
+    [0x5a] = { .simd_size = simd_128, .two_op = 1 },
+    [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+    [0x8c] = { .simd_size = simd_other },
+    [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
     [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
     [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
     [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
@@ -406,6 +412,9 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
+    [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x02] = { .simd_size = simd_packed_int },
     [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
@@ -419,9 +428,12 @@ static const struct {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x38] = { .simd_size = simd_128 },
+    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
+    [0x46] = { .simd_size = simd_packed_int },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -2973,7 +2985,7 @@ x86_decode(
         }
         break;
 
-    case simd_scalar_fp:
+    case simd_scalar_fp: /* case simd_scalar_dq: */
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
@@ -6070,6 +6082,10 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( !vex.l )
                 goto simd_0f_avx;
+            /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x47): /* vpsllv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    simd_0f_avx2:
             host_and_vcpu_must_have(avx2);
             goto simd_0f_ymm;
         }
@@ -6169,7 +6185,10 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( vex.l )
+            {
+    simd_0f_imm8_avx2:
                 host_and_vcpu_must_have(avx2);
+            }
             else
             {
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
@@ -7150,12 +7169,16 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,ymm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
         generate_exception_if(!vex.l, EXC_UD);
         /* fall through */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
-        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,{x,y}mm */
+        if ( ea.type != OP_MEM )
+        {
+            generate_exception_if(b & 2, EXC_UD);
+            host_and_vcpu_must_have(avx2);
+        }
         /* fall through */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7254,6 +7277,11 @@ x86_emulate(
         op_bytes = 8 << vex.l;
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -7370,6 +7398,80 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
+        op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
+        generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
+        goto simd_0f_avx2;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
+    {
+        typeof(vex) *pvex;
+        unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
+
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        host_and_vcpu_must_have(avx2);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /*
+         * While we can't reasonably provide fully correct behavior here
+         * (in particular, for writes, avoiding the memory read in anticipation
+         * of all elements in the range eventually being written), we can (and
+         * should) still limit the memory access to the smallest possible range
+         * (suppressing it altogether if all mask bits are clear), to provide
+         * correct faulting behavior. Read the mask bits via vpmovmskb
+         * for that purpose.
+         */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0xd7; /* vpmovmskb */
+        /* Use %rax as GPR destination and VEX.vvvv as source. */
+        pvex->r = 1;
+        pvex->b = !mode_64bit() || (vex.reg >> 3);
+        opc[1] = 0xc0 | (~vex.reg & 7);
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+        put_stub(stub);
+
+        /* Convert byte granular result to dword/qword granularity. */
+        ea.val &= mask;
+        if ( !ea.val )
+            goto complete_insn;
+
+        first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
+        ea.val >>= first_byte;
+        op_bytes = 32 - __builtin_clz(ea.val);
+
+        /*
+         * Even for the memory write variant a memory read is needed, unless
+         * all set mask bits are contiguous.
+         */
+        if ( ea.val & (ea.val + ~mask + 1) )
+            d = (d & ~SrcMask) | SrcMem;
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert memory operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        opc[1] = modrm & 0x38;
+        fic.insn_bytes = PFX_BYTES + 2;
+
+        break;
+    }
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7578,6 +7680,20 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */
+        generate_exception_if(!vex.l || !vex.w, EXC_UD);
+        goto simd_0f_imm8_avx2;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
+        generate_exception_if(!vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x02): /* vpblendd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx2;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x06): /* vperm2f128 $imm8,ymm/m256,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x18): /* vinsertf128 $imm8,xmm/m128,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x19): /* vextractf128 $imm8,ymm,xmm/m128 */
@@ -8059,6 +8175,7 @@ x86_emulate(
                 {
                 case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps */
                 case X86EMUL_OPC_VEX_66(0x0f38, 0x2f): /* vmaskmovpd */
+                case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} */
                     /* These have merge semantics; force write to occur. */
                     d |= Mov;
                     break;
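
For reference, the mask-to-byte-range reduction used above for the
vpmaskmov{d,q} memory operand can be written as a small standalone helper.
This is only an illustrative sketch (the helper name and the plain-C
scaffolding are not part of the patch); it mirrors the first_byte/op_bytes
computation in the hunk above:

#include <stdbool.h>

/*
 * "msk" is the byte-granular vpmovmskb result for the mask register.  Keep
 * only the bit corresponding to each element's top (sign) byte, then derive
 * the first byte to access and the number of bytes covering all selected
 * elements.  Returns false when no element is selected, in which case the
 * memory access is suppressed altogether.
 */
static bool mask_to_range(unsigned int msk, bool qword,
                          unsigned int *first_byte, unsigned int *nbytes)
{
    unsigned int mask = qword ? 0x80808080U : 0x88888888U;

    msk &= mask;
    if ( !msk )
        return false;

    /* Round the lowest selected bit down to an element boundary. */
    *first_byte = __builtin_ctz(msk) & ~((4u << qword) - 1);
    msk >>= *first_byte;
    /*
     * The kept bits sit in each element's top byte, so the highest set
     * bit + 1 is the end of the last selected element.
     */
    *nbytes = 32 - __builtin_clz(msk);
    return true;
}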




^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 07/25] x86emul: support AVX2 gather insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (5 preceding siblings ...)
  2017-12-07 14:03 ` [PATCH v3 06/25] x86emul: support most remaining AVX2 insns Jan Beulich
@ 2017-12-07 14:03 ` Jan Beulich
  2018-02-01 20:53   ` Andrew Cooper
  2017-12-07 14:04 ` [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
                   ` (17 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:03 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Re-base.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -13,7 +13,8 @@ run: $(TARGET)
 
 SIMD := sse sse2 sse4 avx avx2
 FMA := fma4 fma
-TESTCASES := blowfish $(SIMD) $(FMA)
+SG := avx2-sg
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
@@ -39,6 +40,10 @@ fma-flts := $(avx-flts)
 avx2-vecs := $(avx-vecs)
 avx2-ints := 1 2 4 8
 avx2-flts := 4 8
+avx2-sg-vecs := $(avx2-vecs)
+avx2-sg-idxs := 4 8
+avx2-sg-ints := 4 8
+avx2-sg-flts := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -55,8 +60,18 @@ $(1)-cflags := \
 	$(foreach flt,$($(1)-flts), \
 	  "-D_f$(flt) -m$(1) $(call non-sse,$(1)) -mfpmath=sse -Os -DFLOAT_SIZE=$(flt)")
 endef
+define simd-sg-defs
+$(1)-cflags := \
+	$(foreach vec,$($(1)-vecs), \
+	  $(foreach idx,$($(1)-idxs), \
+	   $(foreach int,$($(1)-ints), \
+	     "-D_$(vec)x$(idx)i$(int) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DINT_SIZE=$(int)") \
+	   $(foreach flt,$($(1)-flts), \
+	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
+$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -78,7 +93,10 @@ $(addsuffix .c,$(SIMD)):
 $(addsuffix .c,$(FMA)):
 	ln -sf simd-fma.c $@
 
-$(addsuffix .o,$(SIMD) $(FMA)): simd.h
+$(addsuffix .c,$(SG)):
+	ln -sf simd-sg.c $@
+
+$(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
 $(TARGET): x86-emulate.o test_x86_emulator.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -0,0 +1,209 @@
+#ifdef INT_SIZE
+# define ELEM_SIZE INT_SIZE
+#else
+# define ELEM_SIZE FLOAT_SIZE
+#endif
+
+#define VEC_SIZE (IDX_SIZE <= ELEM_SIZE ? VEC_MAX \
+                                        : VEC_MAX * ELEM_SIZE / IDX_SIZE)
+#if VEC_SIZE < 16
+# undef VEC_SIZE
+# define VEC_SIZE 16
+#endif
+
+#include "simd.h"
+
+ENTRY(sg_test);
+
+#undef MODE
+#if IDX_SIZE == 4
+# define MODE SI
+#elif IDX_SIZE == 8
+# define MODE DI
+#endif
+
+#define IVEC_SIZE (ELEM_SIZE <= IDX_SIZE ? VEC_MAX \
+                                         : VEC_MAX * IDX_SIZE / ELEM_SIZE)
+#if IVEC_SIZE < 16
+# undef IVEC_SIZE
+# define IVEC_SIZE 16
+#endif
+
+typedef signed int __attribute__((mode(MODE), vector_size(IVEC_SIZE))) idx_t;
+typedef long long __attribute__((vector_size(IVEC_SIZE))) idi_t;
+
+#define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
+                    VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
+
+#if VEC_SIZE == 16
+# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+#else
+# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+#endif
+
+#if defined(__AVX2__)
+# if VEC_MAX == 16
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv4sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv2df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv2di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf(reg, mem, (vdi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv2df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# elif VEC_MAX == 32
+#  if IDX_SIZE == 4
+#   if INT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8si
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gathersiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                idx, (vdi_t)(msk), scl))
+
+#   elif FLOAT_SIZE == 4
+#    define gather __builtin_ia32_gathersiv8sf
+#   elif FLOAT_SIZE == 8
+#    define gather __builtin_ia32_gathersiv4df
+#   endif
+#  elif IDX_SIZE == 8
+#   if INT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4si256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif INT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            (vec_t)(__builtin_ia32_gatherdiv4di((vdi_t)(reg), \
+                                                (const void *)(mem), \
+                                                (vdi_t)(idx), (vdi_t)(msk), \
+                                                scl))
+
+#   elif FLOAT_SIZE == 4
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4sf256(reg, mem, (idi_t)(idx), msk, scl)
+#   elif FLOAT_SIZE == 8
+#    define gather(reg, mem, idx, msk, scl) \
+            __builtin_ia32_gatherdiv4df(reg, mem, (vdi_t)(idx), msk, scl)
+#   endif
+#  endif
+# endif
+#endif
+
+#define GLUE_(x, y) x ## y
+#define GLUE(x, y) GLUE_(x, y)
+
+#define PUT2(n)      (n),        (n) +  1
+#define PUT4(n)  PUT2(n),   PUT2((n) +  2)
+#define PUT8(n)  PUT4(n),   PUT4((n) +  4)
+#define PUT16(n) PUT8(n),   PUT8((n) +  8)
+#define PUT32(n) PUT16(n), PUT16((n) + 16)
+
+const typeof((vec_t){}[0]) array[] = {
+    GLUE(PUT, VEC_MAX)(1),
+    GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
+};
+
+int sg_test(void)
+{
+    unsigned int i;
+    vec_t x, y, full = (vec_t){} == 0;
+    idx_t idx, inv;
+
+    for ( i = 0; i < IVEC_SIZE / IDX_SIZE; ++i )
+    {
+        idx[i] = i + 1;
+        inv[i] = ITEM_COUNT - i;
+    }
+
+    touch(idx);
+    touch(inv);
+
+    x = gather(full, array, (idx_t){}, full, 1);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, idx, full, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != i + 2 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, idx * ELEM_SIZE, full, 2);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != i * 2 + 3 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    x = gather(full, array, inv, full, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != inv[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+    y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
+#if ITEM_COUNT == ELEM_COUNT
+    if ( !to_bool(y == x - 1) )
+        return __LINE__;
+#else
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( y[i] != x[i] - 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( y[i] )
+            return __LINE__;
+#endif
+
+#if ELEM_SIZE > 1
+    x = gather(full, array, inv * 2, full, ELEM_SIZE / 2);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( x[i] != inv[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( x[i] )
+            return __LINE__;
+
+# if ELEM_SIZE == IDX_SIZE
+    y = gather(x, array, idx, (idx & inv) != 0, ELEM_SIZE);
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        if ( y[i] != ((i + 1) & (ITEM_COUNT - i) ? idx : inv)[i] + 1 )
+            return __LINE__;
+    for ( ; i < ELEM_COUNT; ++i )
+        if ( y[i] )
+            return __LINE__;
+# endif
+#endif
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,6 +12,7 @@
 #include "fma4.h"
 #include "fma.h"
 #include "avx2.h"
+#include "avx2-sg.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -60,6 +61,7 @@ static bool simd_check_avx2(void)
 {
     return cpu_has_avx2;
 }
+#define simd_check_avx2_sg simd_check_avx2
 
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
@@ -173,6 +175,22 @@ static const struct {
     SIMD(AVX2 u32x8,             avx2,      32u4),
     SIMD(AVX2 s64x4,             avx2,      32i8),
     SIMD(AVX2 u64x4,             avx2,      32u8),
+    SIMD(AVX2 S/G f32[4x32],  avx2_sg,    16x4f4),
+    SIMD(AVX2 S/G f64[2x32],  avx2_sg,    16x4f8),
+    SIMD(AVX2 S/G f32[2x64],  avx2_sg,    16x8f4),
+    SIMD(AVX2 S/G f64[2x64],  avx2_sg,    16x8f8),
+    SIMD(AVX2 S/G f32[8x32],  avx2_sg,    32x4f4),
+    SIMD(AVX2 S/G f64[4x32],  avx2_sg,    32x4f8),
+    SIMD(AVX2 S/G f32[4x64],  avx2_sg,    32x8f4),
+    SIMD(AVX2 S/G f64[4x64],  avx2_sg,    32x8f8),
+    SIMD(AVX2 S/G i32[4x32],  avx2_sg,    16x4i4),
+    SIMD(AVX2 S/G i64[2x32],  avx2_sg,    16x4i8),
+    SIMD(AVX2 S/G i32[2x64],  avx2_sg,    16x8i4),
+    SIMD(AVX2 S/G i64[2x64],  avx2_sg,    16x8i8),
+    SIMD(AVX2 S/G i32[8x32],  avx2_sg,    32x4i4),
+    SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
+    SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
+    SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -391,6 +391,7 @@ static const struct {
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
     [0x8c] = { .simd_size = simd_other },
     [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
     [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
     [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
@@ -598,6 +599,7 @@ struct x86_emulate_state {
         ext_8f0a,
     } ext;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
+    uint8_t sib_index, sib_scale;
     uint8_t rex_prefix;
     bool lock_prefix;
     bool not_64bit; /* Instruction not available in 64bit. */
@@ -2411,7 +2413,7 @@ x86_decode(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops  *ops)
 {
-    uint8_t b, d, sib, sib_index, sib_base;
+    uint8_t b, d;
     unsigned int def_op_bytes, def_ad_bytes, opcode;
     enum x86_segment override_seg = x86_seg_none;
     bool pc_rel = false;
@@ -2745,6 +2747,7 @@ x86_decode(
 
         if ( modrm_mod == 3 )
         {
+            generate_exception_if(d & vSIB, EXC_UD);
             modrm_rm |= (rex_prefix & 1) << 3;
             ea.type = OP_REG;
         }
@@ -2805,13 +2808,17 @@ x86_decode(
             ea.type = OP_MEM;
             if ( modrm_rm == 4 )
             {
-                sib = insn_fetch_type(uint8_t);
-                sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
-                sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
-                if ( sib_index != 4 && !(d & vSIB) )
-                    ea.mem.off = *decode_register(sib_index, state->regs,
-                                                  false);
-                ea.mem.off <<= (sib >> 6) & 3;
+                uint8_t sib = insn_fetch_type(uint8_t);
+                uint8_t sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
+
+                state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
+                state->sib_scale = (sib >> 6) & 3;
+                if ( state->sib_index != 4 && !(d & vSIB) )
+                {
+                    ea.mem.off = *decode_register(state->sib_index,
+                                                  state->regs, false);
+                    ea.mem.off <<= state->sib_scale;
+                }
                 if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
                     ea.mem.off += insn_fetch_type(int32_t);
                 else if ( sib_base == 4 )
@@ -7472,6 +7479,110 @@ x86_emulate(
         break;
     }
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x92): /* vgatherdp{s,d} {x,y}mm,mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x93): /* vgatherqp{s,d} {x,y}mm,mem,{x,y}mm */
+    {
+        unsigned int mask_reg = ~vex.reg & (mode_64bit() ? 0xf : 7);
+        typeof(vex) *pvex;
+        union {
+            int32_t dw[8];
+            int64_t qw[4];
+        } index, mask;
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if(modrm_reg == state->sib_index ||
+                              modrm_reg == mask_reg ||
+                              state->sib_index == mask_reg, EXC_UD);
+        generate_exception_if(!cpu_has_avx, EXC_UD);
+        vcpu_must_have(avx2);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+
+        /* Read destination, index, and mask registers. */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa */
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pvex->r = !mode_64bit() || !(modrm_reg & 8);
+        pvex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pvex->pfx = vex_f3; /* vmovdqu */
+        /* Switch to sib_index as source. */
+        pvex->r = !mode_64bit() || !(state->sib_index & 8);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+
+        /* Switch to mask_reg as source. */
+        pvex->r = !mode_64bit() || !(mask_reg & 8);
+        opc[1] = (mask_reg & 7) << 3;
+
+        invoke_stub("", "", "=m" (mask) : "a" (&mask));
+        put_stub(stub);
+
+        /* Clear untouched parts of the destination and mask values. */
+        n = 1 << (2 + vex.l - ((b & 1) | vex.w));
+        op_bytes = 4 << vex.w;
+        memset((void *)mmvalp + n * op_bytes, 0, 32 - n * op_bytes);
+        memset((void *)&mask + n * op_bytes, 0, 32 - n * op_bytes);
+
+        for ( i = 0; i < n && rc == X86EMUL_OKAY; ++i )
+        {
+            if ( (vex.w ? mask.qw[i] : mask.dw[i]) < 0 )
+            {
+                signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+                rc = ops->read(ea.mem.seg,
+                               ea.mem.off + (idx << state->sib_scale),
+                               (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+                if ( rc != X86EMUL_OKAY )
+                    break;
+
+#ifdef __XEN__
+                if ( i + 1 < n && local_events_need_delivery() )
+                    rc = X86EMUL_RETRY;
+#endif
+            }
+
+            if ( vex.w )
+                mask.qw[i] = 0;
+            else
+                mask.dw[i] = 0;
+        }
+
+        /* Write destination and mask registers. */
+        opc = init_prefixes(stub);
+        pvex = copy_VEX(opc, vex);
+        pvex->opcx = vex_0f;
+        opc[0] = 0x6f; /* vmovdqa */
+        /* Use modrm_reg as destination and (%rax) as source. */
+        pvex->r = !mode_64bit() || !(modrm_reg & 8);
+        pvex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pvex->reg = 0xf;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        pvex->pfx = vex_f3; /* vmovdqu */
+        /* Switch to mask_reg as destination. */
+        pvex->r = !mode_64bit() || !(mask_reg & 8);
+        opc[1] = (mask_reg & 7) << 3;
+
+        invoke_stub("", "", "+m" (mask) : "a" (&mask));
+        put_stub(stub);
+
+        state->simd_size = simd_none;
+        break;
+    }
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -10,6 +10,7 @@
  */
 
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <asm/x86_emulate.h>
 #include <asm/asm_defns.h> /* mark_regs_dirty() */
 #include <asm/processor.h> /* current_cpu_info */
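
For clarity, the architectural behavior the gather emulation above implements
can be modelled in scalar form roughly as below. This is only a sketch, not
part of the patch: index and data elements are fixed at 32 bits for brevity,
and the stub-based register moves, fault handling, and X86EMUL_RETRY logic
are omitted.

#include <stdint.h>

/* Rough scalar model of the vgatherdp{s,d}/vpgatherd{d,q} family. */
static void gather_model(int32_t *dst, int32_t *mask, const int32_t *index,
                         const uint8_t *base, unsigned int scale,
                         unsigned int n)
{
    unsigned int i;

    for ( i = 0; i < n; ++i )
    {
        if ( mask[i] < 0 )      /* element selected (top bit set)? */
            dst[i] = *(const int32_t *)(base +
                                        ((intptr_t)index[i] << scale));
        /* else dst[i] keeps its previous value (merge semantics). */

        mask[i] = 0;            /* cleared once the element is dealt with */
    }
    /* Destination/mask elements beyond n are zeroed (cf. the memset()s). */
}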




^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (6 preceding siblings ...)
  2017-12-07 14:03 ` [PATCH v3 07/25] x86emul: support AVX2 gather insns Jan Beulich
@ 2017-12-07 14:04 ` Jan Beulich
  2018-02-02 11:43   ` Andrew Cooper
  2017-12-07 14:04 ` [PATCH v3 09/25] x86emul: support XOP insns Jan Beulich
                   ` (16 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:04 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Convert the few opcodes supported so far.

Also adjust two vex_* case labels to better be ext_* (the values are
identical).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -458,6 +458,20 @@ static const opcode_desc_t xop_table[] =
     DstReg|SrcImm|ModRM,
 };
 
+static const struct {
+    uint8_t simd_size:5;
+    uint8_t two_op:1;
+    uint8_t four_op:1;
+} ext8f08_table[256] = {
+};
+
+static const struct {
+    uint8_t simd_size:5;
+    uint8_t two_op:1;
+} ext8f09_table[256] = {
+    [0x01 ... 0x02] = { .two_op = 1 },
+};
+
 #define REX_PREFIX 0x40
 #define REX_B 0x01
 #define REX_X 0x02
@@ -2726,7 +2740,7 @@ x86_decode(
             }
             break;
 
-        case vex_0f38:
+        case ext_0f38:
             d = ext0f38_table[b].to_mem ? DstMem | SrcReg
                                         : DstReg | SrcMem;
             if ( ext0f38_table[b].two_op )
@@ -2736,7 +2750,14 @@ x86_decode(
             state->simd_size = ext0f38_table[b].simd_size;
             break;
 
-        case vex_0f3a:
+        case ext_8f09:
+            if ( ext8f09_table[b].two_op )
+                d |= TwoOp;
+            state->simd_size = ext8f09_table[b].simd_size;
+            break;
+
+        case ext_0f3a:
+        case ext_8f08:
             /*
              * Cannot update d here yet, as the immediate operand still
              * needs fetching.
@@ -2928,6 +2949,15 @@ x86_decode(
         break;
 
     case ext_8f08:
+        d = DstReg | SrcMem;
+        if ( ext8f08_table[b].two_op )
+            d |= TwoOp;
+        else if ( ext8f08_table[b].four_op && !mode_64bit() )
+            imm1 &= 0x7f;
+        state->desc = d;
+        state->simd_size = ext8f08_table[b].simd_size;
+        break;
+
     case ext_8f09:
     case ext_8f0a:
         break;





^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 09/25] x86emul: support XOP insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (7 preceding siblings ...)
  2017-12-07 14:04 ` [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
@ 2017-12-07 14:04 ` Jan Beulich
  2018-02-02 12:03   ` Andrew Cooper
  2017-12-07 14:05 ` [PATCH v3 10/25] x86emul: support 3DNow! insns Jan Beulich
                   ` (15 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:04 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/.gitignore
+++ b/.gitignore
@@ -230,6 +230,7 @@
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
+tools/tests/x86_emulator/xop*.[ch]
 tools/tests/xen-access/xen-access
 tools/tests/xenstore/xs-test
 tools/tests/regression/installed/*
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2
+SIMD := sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -44,6 +44,9 @@ avx2-sg-vecs := $(avx2-vecs)
 avx2-sg-idxs := 4 8
 avx2-sg-ints := 4 8
 avx2-sg-flts := 4 8
+xop-vecs := $(avx-vecs)
+xop-ints := 1 2 4 8
+xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -98,6 +101,8 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
+xop.o: simd-fma.c
+
 $(TARGET): x86-emulate.o test_x86_emulator.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -483,6 +483,86 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
+#ifdef __XOP__
+# undef select
+# if VEC_SIZE == 16
+#  if INT_SIZE == 2 || INT_SIZE == 4
+#   include "simd-fma.c"
+#  endif
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if INT_SIZE == 1 || UINT_SIZE == 1
+#   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
+#  elif INT_SIZE == 2 || UINT_SIZE == 2
+#   define swap2(x) \
+    ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
+                                  (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
+                                          (2 * inv - 2))))
+#  elif FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
+    vec_t t_; \
+    asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
+          "=x" (t_) : \
+          "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
+    t_; \
+})
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
+    /*                            __builtin_ia32_pmovsxdq128( */ \
+    /*                                __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
+    vdi_t s_ = __builtin_ia32_pmovsxdq128( \
+                   __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
+    vec_t t_; \
+    asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
+          "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
+    t_; \
+})
+#  endif
+#  if INT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddbw((vqi_t)(y))))
+#   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphsubbw((vqi_t)(y))))
+#  elif UINT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddubw((vqi_t)(y))))
+#  elif INT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
+                                                 __builtin_ia32_vphaddwd(y))
+#   undef hsub
+#   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
+                                                 __builtin_ia32_vphsubwd(y))
+#  elif UINT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
+                                                         __builtin_ia32_vphadduwd((vhi_t)(y))))
+#   undef hsub
+#  endif
+# elif VEC_SIZE == 32
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps256(x)
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd256(x)
+#  endif
+# elif VEC_SIZE == FLOAT_SIZE
+#  if VEC_SIZE == 4
+#   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
+#  elif VEC_SIZE == 8
+#   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
+#  endif
+# endif
+#endif
 
 int simd_test(void)
 {
@@ -588,6 +668,29 @@ int simd_test(void)
     if ( !to_bool(y == z) ) return __LINE__;
 # endif
 
+# ifdef frac
+    touch(src);
+    x = frac(src);
+    touch(src);
+    if ( !to_bool(x == 0) ) return __LINE__;
+
+    x = 1 / (src + 1);
+    touch(x);
+    y = frac(x);
+    touch(x);
+    if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# if defined(trunc) && defined(frac)
+    x = src / 4;
+    touch(x);
+    y = trunc(x);
+    touch(x);
+    z = frac(x);
+    touch(x);
+    if ( !to_bool(x == y + z) ) return __LINE__;
+# endif
+
 #else
 
 # if ELEM_SIZE > 1
@@ -689,7 +792,7 @@ int simd_test(void)
     y = z << sh;
     if ( !to_bool(x == y + y) ) return __LINE__;
 
-#  if defined(__AVX2__) && ELEM_SIZE >= 4
+#  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
     if ( !to_bool(x == z) ) return __LINE__;
@@ -883,6 +986,8 @@ int simd_test(void)
 #endif
 
 #ifdef hadd
+# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
+     (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
     x = src;
     for ( i = ELEM_COUNT; i >>= 1; )
     {
@@ -890,6 +995,7 @@ int simd_test(void)
         x = hadd((vec_t){}, x);
     }
     if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+# endif
 
 # ifdef hsub
     touch(src);
@@ -901,6 +1007,9 @@ int simd_test(void)
 # endif
 #endif
 
+#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+    return -fma_test();
+#endif
 
     return 0;
 }
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,6 +1,8 @@
 #include "simd.h"
 
+#ifndef __XOP__
 ENTRY(fma_test);
+#endif
 
 #if VEC_SIZE < 16
 # define to_bool(cmp) (!~(cmp)[0])
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -13,6 +13,7 @@
 #include "fma.h"
 #include "avx2.h"
 #include "avx2-sg.h"
+#include "xop.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -63,6 +64,11 @@ static bool simd_check_avx2(void)
 }
 #define simd_check_avx2_sg simd_check_avx2
 
+static bool simd_check_xop(void)
+{
+    return cpu_has_xop;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -191,6 +197,22 @@ static const struct {
     SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
     SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
     SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
+    SIMD(XOP 128bit single,       xop,      16f4),
+    SIMD(XOP 256bit single,       xop,      32f4),
+    SIMD(XOP 128bit double,       xop,      16f8),
+    SIMD(XOP 256bit double,       xop,      32f8),
+    SIMD(XOP s8x16,               xop,      16i1),
+    SIMD(XOP u8x16,               xop,      16u1),
+    SIMD(XOP s16x8,               xop,      16i2),
+    SIMD(XOP u16x8,               xop,      16u2),
+    SIMD(XOP s32x4,               xop,      16i4),
+    SIMD(XOP u32x4,               xop,      16u4),
+    SIMD(XOP s64x2,               xop,      16i8),
+    SIMD(XOP u64x2,               xop,      16u8),
+    SIMD(XOP i8x32,               xop,      32i1),
+    SIMD(XOP i16x16,              xop,      32i2),
+    SIMD(XOP i32x8,               xop,      32i4),
+    SIMD(XOP i64x4,               xop,      32i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -177,6 +177,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 6)) != 0; \
 })
 
+#define cpu_has_xop ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.c & (1U << 11)) != 0; \
+})
+
 #define cpu_has_fma4 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -435,6 +435,7 @@ static const struct {
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
+    [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -463,6 +464,17 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext8f08_table[256] = {
+    [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcc ... 0xcf] = { .simd_size = simd_packed_int },
+    [0xec ... 0xef] = { .simd_size = simd_packed_int },
 };
 
 static const struct {
@@ -470,6 +482,16 @@ static const struct {
     uint8_t two_op:1;
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
+    [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
+    [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd6 ... 0xd7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xe1 ... 0xe3] = { .simd_size = simd_packed_int, .two_op = 1 },
 };
 
 #define REX_PREFIX 0x40
@@ -528,7 +550,7 @@ union vex {
 #define copy_VEX(ptr, vex) ({ \
     if ( !mode_64bit() ) \
         (vex).reg |= 8; \
-    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \
     (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
     (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
     container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
@@ -1654,6 +1676,7 @@ static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 12, ctxt, ops)
 #define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
@@ -2994,9 +3017,19 @@ x86_decode(
     case simd_packed_int:
         switch ( vex.pfx )
         {
-        case vex_none: op_bytes = 8;           break;
-        case vex_66:   op_bytes = 16 << vex.l; break;
-        default:       op_bytes = 0;           break;
+        case vex_none:
+            if ( !vex.opcx )
+            {
+                op_bytes = 8;
+                break;
+            }
+            /* fall through */
+        case vex_66:
+            op_bytes = 16 << vex.l;
+            break;
+        default:
+            op_bytes = 0;
+            break;
         }
         break;
 
@@ -8027,6 +8060,13 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
@@ -8164,6 +8204,41 @@ x86_emulate(
             asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+    case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8e): /* vpmacssdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8f): /* vpmacssdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x95): /* vpmacsww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x96): /* vpmacswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x97): /* vpmacsdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9e): /* vpmacsdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9f): /* vpmacsdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xa6): /* vpmadcsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xb6): /* vpmadcswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc0): /* vprotb $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc1): /* vprotw $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc2): /* vprotd $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc3): /* vprotq $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcc): /* vpcomb $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcd): /* vpcomw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xce): /* vpcomd $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcf): /* vpcomq $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xec): /* vpcomub $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xed): /* vpcomuw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xee): /* vpcomud $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xef): /* vpcomuq $imm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa3): /* vpperm xmm/m128,xmm,xmm,xmm */
+                                    /* vpperm xmm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa2): /* vpcmov {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                    /* vpcmov {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -8213,6 +8288,61 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+    case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x80): /* vfrczps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_XOP(09, 0x81): /* vfrczpd {x,y}mm/mem,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_ymm;
+
+    case X86EMUL_OPC_XOP(09, 0xc1): /* vphaddbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc2): /* vphaddbd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc3): /* vphaddbq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc6): /* vphaddwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc7): /* vphaddwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xcb): /* vphadddq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd1): /* vphaddubw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd2): /* vphaddubd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd3): /* vphaddubq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd6): /* vphadduwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd7): /* vphadduwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xdb): /* vphaddudq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe1): /* vphsubbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe2): /* vphsubwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe3): /* vphsubdq xmm/m128,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x90): /* vprotb xmm/m128,xmm,xmm */
+                                    /* vprotb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x91): /* vprotw xmm/m128,xmm,xmm */
+                                    /* vprotw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x92): /* vprotd xmm/m128,xmm,xmm */
+                                    /* vprotd xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x93): /* vprotq xmm/m128,xmm,xmm */
+                                    /* vprotq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x94): /* vpshlb xmm/m128,xmm,xmm */
+                                    /* vpshlb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x95): /* vpshlw xmm/m128,xmm,xmm */
+                                    /* vpshlw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x96): /* vpshld xmm/m128,xmm,xmm */
+                                    /* vpshld xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x97): /* vpshlq xmm/m128,xmm,xmm */
+                                    /* vpshlq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x98): /* vpshab xmm/m128,xmm,xmm */
+                                    /* vpshab xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x99): /* vpshaw xmm/m128,xmm,xmm */
+                                    /* vpshaw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9a): /* vpshad xmm/m128,xmm,xmm */
+                                    /* vpshad xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9b): /* vpshaq xmm/m128,xmm,xmm */
+                                    /* vpshaq xmm,xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
+#define cpu_has_xop             boot_cpu_has(X86_FEATURE_XOP)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
 #define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 10/25] x86emul: support 3DNow! insns
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (8 preceding siblings ...)
  2017-12-07 14:04 ` [PATCH v3 09/25] x86emul: support XOP insns Jan Beulich
@ 2017-12-07 14:05 ` Jan Beulich
  2018-02-02 13:02   ` Andrew Cooper
  2017-12-07 14:06 ` [PATCH v3 11/25] x86emul: place test blobs in executable section Jan Beulich
                   ` (14 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:05 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Yes, recent AMD CPUs don't support these insns anymore, but I think we
should nevertheless cope with guests using them.
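
Just for illustration (my sketch, not part of the patch): 3DNow! insns all
share the 0F 0F opcode and select the actual operation via a trailing imm8
suffix, so a pair of 16x16 bit tables indexed by the suffix nibbles is
enough to tell supported from unsupported encodings. The lookup the decoder
performs boils down to (is_3dnow_insn() is a made-up helper name):

    #include <stdbool.h>
    #include <stdint.h>

    static bool is_3dnow_insn(const uint16_t table[16], uint8_t suffix)
    {
        /* Row = high nibble of the suffix, bit within it = low nibble. */
        return table[(suffix >> 4) & 0xf] & (1 << (suffix & 0xf));
    }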

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Re-base.

--- a/.gitignore
+++ b/.gitignore
@@ -223,6 +223,7 @@
 tools/security/xensec_tool
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
+tools/tests/x86_emulator/3dnow*.[ch]
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
 tools/tests/x86_emulator/blowfish.h
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2 xop
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -19,6 +19,9 @@ TESTCASES := blowfish $(SIMD) $(FMA) $(S
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
+3dnow-vecs := 8
+3dnow-ints :=
+3dnow-flts := 4
 sse-vecs := 16
 sse-ints :=
 sse-flts := 4
@@ -49,8 +52,13 @@ xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
-# the VEX.vvvv checks in the emulator.
-non-sse = $(if $(filter sse%,$(1)),,-ffixed-xmm0)
+# the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
+# use for floating point operations, to avoid mixing MMX and FPU register
+# uses.  Also enable 3DNow! extensions, but note that we can't use 3dnowa
+# as the test flavor right away since -m3dnowa is understood only by
+# gcc 7.x and newer (older ones want a specific machine model instead).
+3dnowa := $(call cc-option,$(CC),-m3dnowa,-march=k8)
+non-sse = $(if $(filter sse%,$(1)),,$(if $(filter 3dnow%,$(1)),-msse -mfpmath=sse $(3dnowa),-ffixed-xmm0))
 
 define simd-defs
 $(1)-cflags := \
@@ -81,8 +89,9 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
 	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
 		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
 		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
-		(echo "static const unsigned int $(subst -,_,$*)_$(arch)$${flavor}[] = {"; \
+		(echo "static const unsigned int $${prefix}_$(arch)$${flavor}[] = {"; \
 		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
 		 echo "};") >>$@.new; \
 		rm -f $*.bin; \
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -48,6 +48,8 @@ static inline bool _to_bool(byte_vec_t b
 
 #if VEC_SIZE == FLOAT_SIZE
 # define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
+# define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if FLOAT_SIZE == 4
 #  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
@@ -70,7 +72,24 @@ static inline bool _to_bool(byte_vec_t b
 })
 #endif
 
-#if FLOAT_SIZE == 4 && defined(__SSE__)
+#if VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW_A__)
+# define max __builtin_ia32_pfmax
+# define min __builtin_ia32_pfmin
+# define recip(x) ({ \
+    vec_t t_ = __builtin_ia32_pfrcp(x); \
+    touch(x); \
+    t_[1] = __builtin_ia32_pfrcp(__builtin_ia32_pswapdsf(x))[0]; \
+    touch(x); \
+    __builtin_ia32_pfrcpit2(__builtin_ia32_pfrcpit1(t_, x), t_); \
+})
+# define rsqrt(x) ({ \
+    vec_t t_ = __builtin_ia32_pfrsqrt(x); \
+    touch(x); \
+    t_[1] = __builtin_ia32_pfrsqrt(__builtin_ia32_pswapdsf(x))[0]; \
+    touch(x); \
+    __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
+})
+#elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
 #  if defined(__AVX2__)
 #   define broadcast(x) \
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@
 
 #include "x86-emulate.h"
 #include "blowfish.h"
+#include "3dnow.h"
 #include "sse.h"
 #include "sse2.h"
 #include "sse4.h"
@@ -28,6 +29,11 @@ static bool blowfish_check_regs(const st
     return regs->eax == 2 && regs->edx == 1;
 }
 
+static bool simd_check__3dnow(void)
+{
+    return cpu_has_3dnow_ext && cpu_has_sse;
+}
+
 static bool simd_check_sse(void)
 {
     return cpu_has_sse;
@@ -117,6 +123,7 @@ static const struct {
 #else
 # define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
 #endif
+    SIMD(3DNow! single,          _3dnow,     8f4),
     SIMD(SSE scalar single,      sse,         f4),
     SIMD(SSE packed single,      sse,       16f4),
     SIMD(SSE2 scalar single,     sse2,        f4),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -171,6 +171,12 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 8)) != 0; \
 })
 
+#define cpu_has_3dnow_ext ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.d & (1U << 30)) != 0; \
+})
+
 #define cpu_has_sse4a ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(0x80000001, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -355,6 +355,36 @@ static const struct {
     [0xff] = { ModRM }
 };
 
+static const uint16_t _3dnow_table[16] = {
+    [0x0] = (1 << 0xd) /* pi2fd */,
+    [0x1] = (1 << 0xd) /* pf2id */,
+    [0x9] = (1 << 0x0) /* pfcmpge */ |
+            (1 << 0x4) /* pfmin */ |
+            (1 << 0x6) /* pfrcp */ |
+            (1 << 0x7) /* pfrsqrt */ |
+            (1 << 0xa) /* pfsub */ |
+            (1 << 0xe) /* pfadd */,
+    [0xa] = (1 << 0x0) /* pfcmpgt */ |
+            (1 << 0x4) /* pfmax */ |
+            (1 << 0x6) /* pfrcpit1 */ |
+            (1 << 0x7) /* pfrsqit1 */ |
+            (1 << 0xa) /* pfsubr */ |
+            (1 << 0xe) /* pfacc */,
+    [0xb] = (1 << 0x0) /* pfcmpeq */ |
+            (1 << 0x4) /* pfmul */ |
+            (1 << 0x6) /* pfrcpit2 */ |
+            (1 << 0x7) /* pmulhrw */ |
+            (1 << 0xf) /* pavgusb */,
+};
+
+static const uint16_t _3dnow_ext_table[16] = {
+    [0x0] = (1 << 0xc) /* pi2fw */,
+    [0x1] = (1 << 0xc) /* pf2iw */,
+    [0x8] = (1 << 0xa) /* pfnacc */ |
+            (1 << 0xe) /* pfpnacc */,
+    [0xb] = (1 << 0xb) /* pfswapd */,
+};
+
 /*
  * "two_op" and "four_op" below refer to the number of register operands
  * (one of which possibly also allowing to be a memory one). The named
@@ -1671,6 +1701,8 @@ static bool vcpu_has(
 #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
 #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
+#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, EDX, 30, ctxt, ops)
+#define vcpu_has_3dnow()       vcpu_has(0x80000001, EDX, 31, ctxt, ops)
 #define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
 #define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
@@ -5505,6 +5537,26 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
+    case X86EMUL_OPC(0x0f, 0x0e): /* femms */
+        host_and_vcpu_must_have(3dnow);
+        asm volatile ( "femms" );
+        break;
+
+    case X86EMUL_OPC(0x0f, 0x0f): /* 3DNow! */
+        if ( _3dnow_ext_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
+            host_and_vcpu_must_have(3dnow_ext);
+        else if ( _3dnow_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
+            host_and_vcpu_must_have(3dnow);
+        else
+            generate_exception(EXC_UD);
+
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+
+        d = DstReg | SrcMem;
+        op_bytes = 8;
+        state->simd_size = simd_other;
+        goto simd_0f_imm8;
+
 #define CASE_SIMD_PACKED_INT(pfx, opc)       \
     case X86EMUL_OPC(pfx, opc):              \
     case X86EMUL_OPC_66(pfx, opc)
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,6 +71,8 @@
                                  && boot_cpu_has(X86_FEATURE_FFXSR))
 #define cpu_has_page1gb         boot_cpu_has(X86_FEATURE_PAGE1GB)
 #define cpu_has_rdtscp          boot_cpu_has(X86_FEATURE_RDTSCP)
+#define cpu_has_3dnow_ext       boot_cpu_has(X86_FEATURE_3DNOWEXT)
+#define cpu_has_3dnow           boot_cpu_has(X86_FEATURE_3DNOW)
 
 /* CPUID level 0x80000001.ecx */
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 11/25] x86emul: place test blobs in executable section
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (9 preceding siblings ...)
  2017-12-07 14:05 ` [PATCH v3 10/25] x86emul: support 3DNow! insns Jan Beulich
@ 2017-12-07 14:06 ` Jan Beulich
  2018-02-02 13:03   ` Andrew Cooper
  2017-12-07 14:07 ` [PATCH v3 12/25] x86emul: abstract out XCRn accesses Jan Beulich
                   ` (13 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:06 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

This allows the section contents to be disassembled without going
through any extra hoops, simplifying the analysis of problems in test
and/or emulation code.

Since the blobs are emitted as (r/o) data, we need to accept an assembler
warning here (about the differing section attributes).
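
As background (my sketch, not part of the patch): the mechanism boils down
to declaring .test once with executable attributes and then forcing the
generated (r/o) arrays into that section, which is also where the warning
comes from. Roughly (example_blob is a made-up name):

    /* Declare .test as allocatable + executable, then place data in it. */
    asm ( ".pushsection .test, \"ax\", @progbits; .popsection" );

    static const unsigned int __attribute__((section(".test")))
        example_blob[] = { 0xc3, };   /* a lone "ret" */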

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -91,7 +91,7 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
 		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
 		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
-		(echo "static const unsigned int $${prefix}_$(arch)$${flavor}[] = {"; \
+		(echo 'static const unsigned int __attribute__((section(".test"))) '"$${prefix}_$(arch)$${flavor}[] = {"; \
 		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
 		 echo "};") >>$@.new; \
 		rm -f $*.bin; \
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -4,6 +4,9 @@
 #include <sys/mman.h>
 
 #include "x86-emulate.h"
+
+asm ( ".pushsection .test, \"ax\", @progbits; .popsection" );
+
 #include "blowfish.h"
 #include "3dnow.h"
 #include "sse.h"
@@ -1127,9 +1130,9 @@ int main(int argc, char **argv)
 
 #define decl_insn(which) extern const unsigned char which[], \
                          which##_end[] asm ( ".L" #which "_end" )
-#define put_insn(which, insn) ".pushsection .test, \"ax\", @progbits\n" \
-                              #which ": " insn "\n"                     \
-                              ".L" #which "_end:\n"                     \
+#define put_insn(which, insn) ".pushsection .test\n" \
+                              #which ": " insn "\n"  \
+                              ".L" #which "_end:\n"  \
                               ".popsection"
 #define set_insn(which) (regs.eip = (unsigned long)(which))
 #define valid_eip(which) (regs.eip >= (unsigned long)(which) && \




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 12/25] x86emul: abstract out XCRn accesses
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (10 preceding siblings ...)
  2017-12-07 14:06 ` [PATCH v3 11/25] x86emul: place test blobs in executable section Jan Beulich
@ 2017-12-07 14:07 ` Jan Beulich
  2018-02-02 13:29   ` Andrew Cooper
  2017-12-07 14:08 ` [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0 Jan Beulich
                   ` (12 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:07 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Use hooks, just like done for other special purpose registers.

This includes moving XCR0 checks from hvmemul_get_fpu() to the emulator
itself as well as adding support for XGETBV emulation.

For now, fuzzer reads will obtain the real hardware values (apart from the
fuzzing of the hook pointer itself).
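
To illustrate the shape of the new interface (names taken from the patch,
but this exact fragment is mine): emulator code no longer peeks at
curr->arch.xcr0 directly, but goes through the hook, roughly

    uint64_t xcr0;

    if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY )
        xcr0 = 0;
    if ( !(xcr0 & XSTATE_SSE) || !(xcr0 & XSTATE_YMM) )
        return X86EMUL_UNHANDLEABLE; /* AVX state not enabled for the guest */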

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
---
v2: Re-base.

--- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
+++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
@@ -458,6 +458,8 @@ static int fuzz_write_cr(
     return X86EMUL_OKAY;
 }
 
+#define fuzz_read_xcr emul_test_read_xcr
+
 enum {
     MSRI_IA32_SYSENTER_CS,
     MSRI_IA32_SYSENTER_ESP,
@@ -576,6 +578,7 @@ static const struct x86_emulate_ops all_
     SET(write_io),
     SET(read_cr),
     SET(write_cr),
+    SET(read_xcr),
     SET(read_msr),
     SET(write_msr),
     SET(wbinvd),
@@ -684,6 +687,7 @@ enum {
     HOOK_write_cr,
     HOOK_read_dr,
     HOOK_write_dr,
+    HOOK_read_xcr,
     HOOK_read_msr,
     HOOK_write_msr,
     HOOK_wbinvd,
@@ -728,6 +732,7 @@ static void disable_hooks(struct x86_emu
     MAYBE_DISABLE_HOOK(write_io);
     MAYBE_DISABLE_HOOK(read_cr);
     MAYBE_DISABLE_HOOK(write_cr);
+    MAYBE_DISABLE_HOOK(read_xcr);
     MAYBE_DISABLE_HOOK(read_msr);
     MAYBE_DISABLE_HOOK(write_msr);
     MAYBE_DISABLE_HOOK(wbinvd);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -371,6 +371,7 @@ static struct x86_emulate_ops emulops =
     .read_segment = read_segment,
     .cpuid      = emul_test_cpuid,
     .read_cr    = emul_test_read_cr,
+    .read_xcr   = emul_test_read_xcr,
     .read_msr   = read_msr,
     .get_fpu    = emul_test_get_fpu,
     .put_fpu    = emul_test_put_fpu,
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -120,6 +120,19 @@ int emul_test_read_cr(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int emul_test_read_xcr(
+    unsigned int reg,
+    uint64_t *val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    uint32_t lo, hi;
+
+    asm ( "xgetbv" : "=a" (lo), "=d" (hi) : "c" (reg) );
+    *val = lo | ((uint64_t)hi << 32);
+
+    return X86EMUL_OKAY;
+}
+
 int emul_test_get_fpu(
     void (*exception_callback)(void *, struct cpu_user_regs *),
     void *exception_callback_arg,
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -220,6 +220,11 @@ int emul_test_read_cr(
     unsigned long *val,
     struct x86_emulate_ctxt *ctxt);
 
+int emul_test_read_xcr(
+    unsigned int reg,
+    uint64_t *val,
+    struct x86_emulate_ctxt *ctxt);
+
 int emul_test_get_fpu(
     void (*exception_callback)(void *, struct cpu_user_regs *),
     void *exception_callback_arg,
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1825,6 +1825,49 @@ static int hvmemul_write_cr(
     return rc;
 }
 
+static int hvmemul_read_xcr(
+    unsigned int reg,
+    uint64_t *val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    uint32_t lo, hi;
+
+    switch ( reg )
+    {
+    case 0:
+        *val = current->arch.xcr0;
+        return X86EMUL_OKAY;
+
+    case 1:
+        if ( !cpu_has_xgetbv1 )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
+    default:
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    asm ( ".byte 0x0f,0x01,0xd0" /* xgetbv */
+          : "=a" (lo), "=d" (hi) : "c" (reg) );
+    *val = lo | ((uint64_t)hi << 32);
+    HVMTRACE_LONG_2D(XCR_READ, reg, TRC_PAR_LONG(*val));
+
+    return X86EMUL_OKAY;
+}
+
+static int hvmemul_write_xcr(
+    unsigned int reg,
+    uint64_t val,
+    struct x86_emulate_ctxt *ctxt)
+{
+    HVMTRACE_LONG_2D(XCR_WRITE, reg, TRC_PAR_LONG(val));
+    if ( likely(handle_xsetbv(reg, val) == 0) )
+        return X86EMUL_OKAY;
+
+    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+    return X86EMUL_EXCEPTION;
+}
+
 static int hvmemul_read_msr(
     unsigned int reg,
     uint64_t *val,
@@ -1873,22 +1916,6 @@ static int hvmemul_get_fpu(
 {
     struct vcpu *curr = current;
 
-    switch ( type )
-    {
-    case X86EMUL_FPU_fpu:
-    case X86EMUL_FPU_wait:
-    case X86EMUL_FPU_mmx:
-    case X86EMUL_FPU_xmm:
-        break;
-    case X86EMUL_FPU_ymm:
-        if ( !(curr->arch.xcr0 & XSTATE_SSE) ||
-             !(curr->arch.xcr0 & XSTATE_YMM) )
-            return X86EMUL_UNHANDLEABLE;
-        break;
-    default:
-        return X86EMUL_UNHANDLEABLE;
-    }
-
     if ( !curr->fpu_dirtied )
         hvm_funcs.fpu_dirty_intercept();
     else if ( type == X86EMUL_FPU_fpu )
@@ -2072,6 +2099,8 @@ static const struct x86_emulate_ops hvm_
     .write_io      = hvmemul_write_io,
     .read_cr       = hvmemul_read_cr,
     .write_cr      = hvmemul_write_cr,
+    .read_xcr      = hvmemul_read_xcr,
+    .write_xcr     = hvmemul_write_xcr,
     .read_msr      = hvmemul_read_msr,
     .write_msr     = hvmemul_write_msr,
     .wbinvd        = hvmemul_wbinvd,
@@ -2097,6 +2126,8 @@ static const struct x86_emulate_ops hvm_
     .write_io      = hvmemul_write_io_discard,
     .read_cr       = hvmemul_read_cr,
     .write_cr      = hvmemul_write_cr,
+    .read_xcr      = hvmemul_read_xcr,
+    .write_xcr     = hvmemul_write_xcr,
     .read_msr      = hvmemul_read_msr,
     .write_msr     = hvmemul_write_msr_discard,
     .wbinvd        = hvmemul_wbinvd_discard,
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -36,6 +36,7 @@
 #include <asm/shared.h>
 #include <asm/traps.h>
 #include <asm/x86_emulate.h>
+#include <asm/xstate.h>
 
 #include <xsm/xsm.h>
 
@@ -818,6 +819,16 @@ static int write_dr(unsigned int reg, un
            ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
 }
 
+static int write_xcr(unsigned int reg, uint64_t val,
+                     struct x86_emulate_ctxt *ctxt)
+{
+    if ( likely(handle_xsetbv(reg, val) == 0) )
+        return X86EMUL_OKAY;
+
+    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+    return X86EMUL_EXCEPTION;
+}
+
 static inline uint64_t guest_misc_enable(uint64_t val)
 {
     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
@@ -1313,6 +1324,7 @@ static const struct x86_emulate_ops priv
     .write_cr            = write_cr,
     .read_dr             = read_dr,
     .write_dr            = write_dr,
+    .write_xcr           = write_xcr,
     .read_msr            = read_msr,
     .write_msr           = write_msr,
     .cpuid               = pv_emul_cpuid,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1118,10 +1118,27 @@ static int _get_fpu(
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
+    uint64_t xcr0;
     int rc;
 
     fail_if(!ops->get_fpu);
     ASSERT(type != X86EMUL_FPU_none);
+
+    if ( type < X86EMUL_FPU_ymm || !ops->read_xcr ||
+         ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY )
+        xcr0 = 0;
+
+    switch ( type )
+    {
+    case X86EMUL_FPU_ymm:
+        if ( !(xcr0 & XSTATE_SSE) || !(xcr0 & XSTATE_YMM) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
+    default:
+        break;
+    }
+
     rc = ops->get_fpu(fpu_handle_exception, fic, type, ctxt);
 
     if ( rc == X86EMUL_OKAY )
@@ -1649,7 +1666,8 @@ in_protmode(
 #define EBX 3
 
 static bool vcpu_has(
-    unsigned int eax,
+    unsigned int leaf,
+    unsigned int subleaf,
     unsigned int reg,
     unsigned int bit,
     struct x86_emulate_ctxt *ctxt,
@@ -1659,7 +1677,7 @@ static bool vcpu_has(
     int rc = X86EMUL_OKAY;
 
     fail_if(!ops->cpuid);
-    rc = ops->cpuid(eax, 0, &res, ctxt);
+    rc = ops->cpuid(leaf, subleaf, &res, ctxt);
     if ( rc == X86EMUL_OKAY )
     {
         switch ( reg )
@@ -1678,53 +1696,56 @@ static bool vcpu_has(
     return rc == X86EMUL_OKAY;
 }
 
-#define vcpu_has_fpu()         vcpu_has(         1, EDX,  0, ctxt, ops)
-#define vcpu_has_sep()         vcpu_has(         1, EDX, 11, ctxt, ops)
-#define vcpu_has_cx8()         vcpu_has(         1, EDX,  8, ctxt, ops)
-#define vcpu_has_cmov()        vcpu_has(         1, EDX, 15, ctxt, ops)
-#define vcpu_has_clflush()     vcpu_has(         1, EDX, 19, ctxt, ops)
-#define vcpu_has_mmx()         vcpu_has(         1, EDX, 23, ctxt, ops)
-#define vcpu_has_sse()         vcpu_has(         1, EDX, 25, ctxt, ops)
-#define vcpu_has_sse2()        vcpu_has(         1, EDX, 26, ctxt, ops)
-#define vcpu_has_sse3()        vcpu_has(         1, ECX,  0, ctxt, ops)
-#define vcpu_has_pclmulqdq()   vcpu_has(         1, ECX,  1, ctxt, ops)
-#define vcpu_has_ssse3()       vcpu_has(         1, ECX,  9, ctxt, ops)
-#define vcpu_has_fma()         vcpu_has(         1, ECX, 12, ctxt, ops)
-#define vcpu_has_cx16()        vcpu_has(         1, ECX, 13, ctxt, ops)
-#define vcpu_has_sse4_1()      vcpu_has(         1, ECX, 19, ctxt, ops)
-#define vcpu_has_sse4_2()      vcpu_has(         1, ECX, 20, ctxt, ops)
-#define vcpu_has_movbe()       vcpu_has(         1, ECX, 22, ctxt, ops)
-#define vcpu_has_popcnt()      vcpu_has(         1, ECX, 23, ctxt, ops)
-#define vcpu_has_aesni()       vcpu_has(         1, ECX, 25, ctxt, ops)
-#define vcpu_has_avx()         vcpu_has(         1, ECX, 28, ctxt, ops)
-#define vcpu_has_f16c()        vcpu_has(         1, ECX, 29, ctxt, ops)
-#define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
-#define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
+#define X 0 /* Just for documentation purposes. */
+
+#define vcpu_has_fpu()         vcpu_has(         1, X, EDX,  0, ctxt, ops)
+#define vcpu_has_sep()         vcpu_has(         1, X, EDX, 11, ctxt, ops)
+#define vcpu_has_cx8()         vcpu_has(         1, X, EDX,  8, ctxt, ops)
+#define vcpu_has_cmov()        vcpu_has(         1, X, EDX, 15, ctxt, ops)
+#define vcpu_has_clflush()     vcpu_has(         1, X, EDX, 19, ctxt, ops)
+#define vcpu_has_mmx()         vcpu_has(         1, X, EDX, 23, ctxt, ops)
+#define vcpu_has_sse()         vcpu_has(         1, X, EDX, 25, ctxt, ops)
+#define vcpu_has_sse2()        vcpu_has(         1, X, EDX, 26, ctxt, ops)
+#define vcpu_has_sse3()        vcpu_has(         1, X, ECX,  0, ctxt, ops)
+#define vcpu_has_pclmulqdq()   vcpu_has(         1, X, ECX,  1, ctxt, ops)
+#define vcpu_has_ssse3()       vcpu_has(         1, X, ECX,  9, ctxt, ops)
+#define vcpu_has_fma()         vcpu_has(         1, X, ECX, 12, ctxt, ops)
+#define vcpu_has_cx16()        vcpu_has(         1, X, ECX, 13, ctxt, ops)
+#define vcpu_has_sse4_1()      vcpu_has(         1, X, ECX, 19, ctxt, ops)
+#define vcpu_has_sse4_2()      vcpu_has(         1, X, ECX, 20, ctxt, ops)
+#define vcpu_has_movbe()       vcpu_has(         1, X, ECX, 22, ctxt, ops)
+#define vcpu_has_popcnt()      vcpu_has(         1, X, ECX, 23, ctxt, ops)
+#define vcpu_has_aesni()       vcpu_has(         1, X, ECX, 25, ctxt, ops)
+#define vcpu_has_avx()         vcpu_has(         1, X, ECX, 28, ctxt, ops)
+#define vcpu_has_f16c()        vcpu_has(         1, X, ECX, 29, ctxt, ops)
+#define vcpu_has_rdrand()      vcpu_has(         1, X, ECX, 30, ctxt, ops)
+#define vcpu_has_mmxext()     (vcpu_has(0x80000001, X, EDX, 22, ctxt, ops) || \
                                vcpu_has_sse())
-#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, EDX, 30, ctxt, ops)
-#define vcpu_has_3dnow()       vcpu_has(0x80000001, EDX, 31, ctxt, ops)
-#define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
-#define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
-#define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
-#define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
-#define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
-#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 12, ctxt, ops)
-#define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
-#define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
-#define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
-#define vcpu_has_hle()         vcpu_has(         7, EBX,  4, ctxt, ops)
-#define vcpu_has_avx2()        vcpu_has(         7, EBX,  5, ctxt, ops)
-#define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
-#define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
-#define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
-#define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
-#define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
-#define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
-#define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
-#define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
-#define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
-#define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
-#define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
+#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, X, EDX, 30, ctxt, ops)
+#define vcpu_has_3dnow()       vcpu_has(0x80000001, X, EDX, 31, ctxt, ops)
+#define vcpu_has_lahf_lm()     vcpu_has(0x80000001, X, ECX,  0, ctxt, ops)
+#define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, X, ECX,  4, ctxt, ops)
+#define vcpu_has_lzcnt()       vcpu_has(0x80000001, X, ECX,  5, ctxt, ops)
+#define vcpu_has_sse4a()       vcpu_has(0x80000001, X, ECX,  6, ctxt, ops)
+#define vcpu_has_misalignsse() vcpu_has(0x80000001, X, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, X, ECX, 12, ctxt, ops)
+#define vcpu_has_fma4()        vcpu_has(0x80000001, X, ECX, 16, ctxt, ops)
+#define vcpu_has_tbm()         vcpu_has(0x80000001, X, ECX, 21, ctxt, ops)
+#define vcpu_has_bmi1()        vcpu_has(         7, 0, EBX,  3, ctxt, ops)
+#define vcpu_has_hle()         vcpu_has(         7, 0, EBX,  4, ctxt, ops)
+#define vcpu_has_avx2()        vcpu_has(         7, 0, EBX,  5, ctxt, ops)
+#define vcpu_has_bmi2()        vcpu_has(         7, 0, EBX,  8, ctxt, ops)
+#define vcpu_has_rtm()         vcpu_has(         7, 0, EBX, 11, ctxt, ops)
+#define vcpu_has_mpx()         vcpu_has(         7, 0, EBX, 14, ctxt, ops)
+#define vcpu_has_rdseed()      vcpu_has(         7, 0, EBX, 18, ctxt, ops)
+#define vcpu_has_adx()         vcpu_has(         7, 0, EBX, 19, ctxt, ops)
+#define vcpu_has_smap()        vcpu_has(         7, 0, EBX, 20, ctxt, ops)
+#define vcpu_has_clflushopt()  vcpu_has(         7, 0, EBX, 23, ctxt, ops)
+#define vcpu_has_clwb()        vcpu_has(         7, 0, EBX, 24, ctxt, ops)
+#define vcpu_has_sha()         vcpu_has(         7, 0, EBX, 29, ctxt, ops)
+#define vcpu_has_rdpid()       vcpu_has(         7, 0, ECX, 22, ctxt, ops)
+#define vcpu_has_xgetbv1()     vcpu_has(       0xd, 1, EAX,  2, ctxt, ops)
+#define vcpu_has_clzero()      vcpu_has(0x80000008, X, EBX,  0, ctxt, ops)
 
 #define vcpu_must_have(feat) \
     generate_exception_if(!vcpu_has_##feat(), EXC_UD)
@@ -5161,18 +5182,33 @@ x86_emulate(
                 _regs.eflags |= X86_EFLAGS_AC;
             break;
 
-#ifdef __XEN__
-        case 0xd1: /* xsetbv */
+        case 0xd0: /* xgetbv */
             generate_exception_if(vex.pfx, EXC_UD);
-            if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+            if ( !ops->read_cr || !ops->read_xcr ||
+                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
                 cr4 = 0;
             generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
-            generate_exception_if(!mode_ring0() ||
-                                  handle_xsetbv(_regs.ecx,
-                                                _regs.eax | (_regs.rdx << 32)),
+            generate_exception_if(_regs.ecx > (vcpu_has_xgetbv1() ? 1 : 0),
                                   EXC_GP, 0);
+            rc = ops->read_xcr(_regs.ecx, &msr_val, ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
+            _regs.r(ax) = (uint32_t)msr_val;
+            _regs.r(dx) = msr_val >> 32;
+            break;
+
+        case 0xd1: /* xsetbv */
+            generate_exception_if(vex.pfx, EXC_UD);
+            if ( !ops->read_cr || !ops->write_xcr ||
+                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+                cr4 = 0;
+            generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
+            generate_exception_if(!mode_ring0() || _regs.ecx, EXC_GP, 0);
+            rc = ops->write_xcr(_regs.ecx,
+                                _regs.eax | ((uint64_t)_regs.edx << 32), ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
             break;
-#endif
 
         case 0xd4: /* vmfunc */
             generate_exception_if(vex.pfx, EXC_UD);
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -390,6 +390,24 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * read_xcr: Read from extended control register.
+     *  @reg:   [IN ] Register to read.
+     */
+    int (*read_xcr)(
+        unsigned int reg,
+        uint64_t *val,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
+     * write_xcr: Write to extended control register.
+     *  @reg:   [IN ] Register to write.
+     */
+    int (*write_xcr)(
+        unsigned int reg,
+        uint64_t val,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * read_msr: Read from model-specific register.
      *  @reg:   [IN ] Register to read.
      */
--- a/xen/include/asm-x86/hvm/trace.h
+++ b/xen/include/asm-x86/hvm/trace.h
@@ -33,6 +33,8 @@
 #define DO_TRC_HVM_CR_WRITE64  DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_DR_READ     DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_DR_WRITE    DEFAULT_HVM_REGACCESS
+#define DO_TRC_HVM_XCR_READ64  DEFAULT_HVM_REGACCESS
+#define DO_TRC_HVM_XCR_WRITE64 DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_MSR_READ    DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_MSR_WRITE   DEFAULT_HVM_REGACCESS
 #define DO_TRC_HVM_RDTSC       DEFAULT_HVM_REGACCESS
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -66,4 +66,28 @@
 #define X86_CR4_SMAP       0x00200000 /* enable SMAP */
 #define X86_CR4_PKE        0x00400000 /* enable PKE */
 
+/*
+ * XSTATE component flags in XCR0
+ */
+#define _XSTATE_FP                0
+#define XSTATE_FP                 (1ULL << _XSTATE_FP)
+#define _XSTATE_SSE               1
+#define XSTATE_SSE                (1ULL << _XSTATE_SSE)
+#define _XSTATE_YMM               2
+#define XSTATE_YMM                (1ULL << _XSTATE_YMM)
+#define _XSTATE_BNDREGS           3
+#define XSTATE_BNDREGS            (1ULL << _XSTATE_BNDREGS)
+#define _XSTATE_BNDCSR            4
+#define XSTATE_BNDCSR             (1ULL << _XSTATE_BNDCSR)
+#define _XSTATE_OPMASK            5
+#define XSTATE_OPMASK             (1ULL << _XSTATE_OPMASK)
+#define _XSTATE_ZMM               6
+#define XSTATE_ZMM                (1ULL << _XSTATE_ZMM)
+#define _XSTATE_HI_ZMM            7
+#define XSTATE_HI_ZMM             (1ULL << _XSTATE_HI_ZMM)
+#define _XSTATE_PKRU              9
+#define XSTATE_PKRU               (1ULL << _XSTATE_PKRU)
+#define _XSTATE_LWP               62
+#define XSTATE_LWP                (1ULL << _XSTATE_LWP)
+
 #endif	/* __XEN_X86_DEFNS_H__ */
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -10,6 +10,7 @@
 
 #include <xen/sched.h>
 #include <asm/cpufeature.h>
+#include <asm/x86-defns.h>
 
 #define FCW_DEFAULT               0x037f
 #define FCW_RESET                 0x0040
@@ -28,27 +29,6 @@ extern uint32_t mxcsr_mask;
 #define XSAVE_HDR_OFFSET          FXSAVE_SIZE
 #define XSTATE_AREA_MIN_SIZE      (FXSAVE_SIZE + XSAVE_HDR_SIZE)
 
-#define _XSTATE_FP                0
-#define XSTATE_FP                 (1ULL << _XSTATE_FP)
-#define _XSTATE_SSE               1
-#define XSTATE_SSE                (1ULL << _XSTATE_SSE)
-#define _XSTATE_YMM               2
-#define XSTATE_YMM                (1ULL << _XSTATE_YMM)
-#define _XSTATE_BNDREGS           3
-#define XSTATE_BNDREGS            (1ULL << _XSTATE_BNDREGS)
-#define _XSTATE_BNDCSR            4
-#define XSTATE_BNDCSR             (1ULL << _XSTATE_BNDCSR)
-#define _XSTATE_OPMASK            5
-#define XSTATE_OPMASK             (1ULL << _XSTATE_OPMASK)
-#define _XSTATE_ZMM               6
-#define XSTATE_ZMM                (1ULL << _XSTATE_ZMM)
-#define _XSTATE_HI_ZMM            7
-#define XSTATE_HI_ZMM             (1ULL << _XSTATE_HI_ZMM)
-#define _XSTATE_PKRU              9
-#define XSTATE_PKRU               (1ULL << _XSTATE_PKRU)
-#define _XSTATE_LWP               62
-#define XSTATE_LWP                (1ULL << _XSTATE_LWP)
-
 #define XSTATE_FP_SSE  (XSTATE_FP | XSTATE_SSE)
 #define XCNTXT_MASK    (XSTATE_FP | XSTATE_SSE | XSTATE_YMM | XSTATE_OPMASK | \
                         XSTATE_ZMM | XSTATE_HI_ZMM | XSTATE_NONLAZY)
--- a/xen/include/public/trace.h
+++ b/xen/include/public/trace.h
@@ -235,6 +235,8 @@
 #define TRC_HVM_TRAP             (TRC_HVM_HANDLER + 0x23)
 #define TRC_HVM_TRAP_DEBUG       (TRC_HVM_HANDLER + 0x24)
 #define TRC_HVM_VLAPIC           (TRC_HVM_HANDLER + 0x25)
+#define TRC_HVM_XCR_READ64      (TRC_HVM_HANDLER + TRC_64_FLAG + 0x26)
+#define TRC_HVM_XCR_WRITE64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x27)
 
 #define TRC_HVM_IOPORT_WRITE    (TRC_HVM_HANDLER + 0x216)
 #define TRC_HVM_IOMEM_WRITE     (TRC_HVM_HANDLER + 0x217)



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (11 preceding siblings ...)
  2017-12-07 14:07 ` [PATCH v3 12/25] x86emul: abstract out XCRn accesses Jan Beulich
@ 2017-12-07 14:08 ` Jan Beulich
  2018-02-02 13:30   ` Andrew Cooper
  2017-12-07 14:09 ` [PATCH v3 14/25] x86emul: make all FPU emulation use the stub Jan Beulich
                   ` (11 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:08 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Experimentally, MPX instructions have been confirmed to behave as NOPs
unless both related XCR0 bits are set. By implication, branches then also
don't clear BNDn.
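
(Illustration only, not part of the patch:) the consequence for the
emulator is that BNDn adjustment can simply be skipped unless XCR0 has
both MPX components enabled, i.e. a check along the lines of

    if ( !(xcr0 & XSTATE_BNDREGS) || !(xcr0 & XSTATE_BNDCSR) )
        return; /* MPX insns act as NOPs; branches leave BNDn alone */

which is what the hunk below adds to adjust_bnd().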

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2143,12 +2143,16 @@ static bool umip_active(struct x86_emula
 static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
                        const struct x86_emulate_ops *ops, enum vex_pfx pfx)
 {
-    uint64_t bndcfg;
+    uint64_t xcr0, bndcfg;
     int rc;
 
     if ( pfx == vex_f2 || !cpu_has_mpx || !vcpu_has_mpx() )
         return;
 
+    if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY ||
+         !(xcr0 & XSTATE_BNDREGS) || !(xcr0 & XSTATE_BNDCSR) )
+        return;
+
     if ( !mode_ring0() )
         bndcfg = read_bndcfgu();
     else if ( !ops->read_msr ||




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 14/25] x86emul: make all FPU emulation use the stub
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (12 preceding siblings ...)
  2017-12-07 14:08 ` [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0 Jan Beulich
@ 2017-12-07 14:09 ` Jan Beulich
  2018-02-02 13:37   ` Andrew Cooper
  2017-12-07 14:10 ` [PATCH v3 15/25] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
                   ` (10 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:09 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

While this means quite a reduction in (source) code, the main purpose is
to no longer have exceptions raised from anywhere other than the stubs.
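
For context (my illustration, not part of the patch): the replacement
macros synthesize a three byte stub of the form { opcode, ModRM, ret } and
invoke it with %rax pointing at the memory operand, so e.g. "fadds (%rax)"
no longer needs its own inline asm. A sketch of the encoding they use:

    #include <stdint.h>

    /* Encode "<opc> /ext (%rax)" followed by RET, as the
     * emulate_fpu_insn_mem*() macros do. */
    static void encode_fpu_stub(uint8_t buf[3], uint8_t opc, unsigned int ext)
    {
        buf[0] = opc;             /* e.g. 0xd8 for the fadds group */
        buf[1] = (ext & 7) << 3;  /* ModRM: mod=00, reg=ext, rm=000 -> (%rax) */
        buf[2] = 0xc3;            /* ret */
    }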

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Re-base.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1262,28 +1262,25 @@ static inline bool fpu_check_write(void)
     return !(fsw & FSW_ES);
 }
 
-#define emulate_fpu_insn(_op)                           \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op "     \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes) : : "memory" )
-
-#define emulate_fpu_insn_memdst(_op, _arg)              \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op " %1  \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes), "=m" (_arg)            \
-        : : "memory" )
-
-#define emulate_fpu_insn_memsrc(_op, _arg)              \
-    asm volatile (                                      \
-        "movb $2f-1f,%0 \n"                             \
-        "1: " _op " %1  \n"                             \
-        "2:             \n"                             \
-        : "=m" (fic.insn_bytes)                         \
-        : "m" (_arg) : "memory" )
+#define emulate_fpu_insn_memdst(opc, ext, arg)                          \
+do {                                                                    \
+    /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
+    fic.insn_bytes = 2;                                                 \
+    memcpy(get_stub(stub),                                              \
+           ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
+    invoke_stub("", "", "+m" (fic), "+m" (arg) : "a" (&(arg)));         \
+    put_stub(stub);                                                     \
+} while (0)
+
+#define emulate_fpu_insn_memsrc(opc, ext, arg)                          \
+do {                                                                    \
+    /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
+    fic.insn_bytes = 2;                                                 \
+    memcpy(get_stub(stub),                                              \
+           ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
+    invoke_stub("", "", "+m" (fic) : "m" (arg), "a" (&(arg)));          \
+    put_stub(stub);                                                     \
+} while (0)
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
@@ -3846,8 +3843,7 @@ x86_emulate(
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_wait, &fic);
-        fic.insn_bytes = 1;
-        asm volatile ( "fwait" ::: "memory" );
+        emulate_fpu_insn_stub(b);
         check_fpu_exn(&fic);
         break;
 
@@ -4266,37 +4262,13 @@ x86_emulate(
             emulate_fpu_insn_stub(0xd8, modrm);
             break;
         default:
+        fpu_memsrc32:
             ASSERT(ea.type == OP_MEM);
             if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                  4, ctxt)) != X86EMUL_OKAY )
                 goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fadd */
-                emulate_fpu_insn_memsrc("fadds", src.val);
-                break;
-            case 1: /* fmul */
-                emulate_fpu_insn_memsrc("fmuls", src.val);
-                break;
-            case 2: /* fcom */
-                emulate_fpu_insn_memsrc("fcoms", src.val);
-                break;
-            case 3: /* fcomp */
-                emulate_fpu_insn_memsrc("fcomps", src.val);
-                break;
-            case 4: /* fsub */
-                emulate_fpu_insn_memsrc("fsubs", src.val);
-                break;
-            case 5: /* fsubr */
-                emulate_fpu_insn_memsrc("fsubrs", src.val);
-                break;
-            case 6: /* fdiv */
-                emulate_fpu_insn_memsrc("fdivs", src.val);
-                break;
-            case 7: /* fdivr */
-                emulate_fpu_insn_memsrc("fdivrs", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4343,52 +4315,46 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fld m32fp */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     4, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("flds", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc32;
             case 2: /* fst m32fp */
-                emulate_fpu_insn_memdst("fsts", dst.val);
-                dst.bytes = 4;
-                break;
             case 3: /* fstp m32fp */
-                emulate_fpu_insn_memdst("fstps", dst.val);
+            fpu_memdst32:
+                dst = ea;
                 dst.bytes = 4;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             case 4: /* fldenv - TODO */
                 state->fpu_ctrl = true;
                 goto unimplemented_insn;
             case 5: /* fldcw m2byte */
                 state->fpu_ctrl = true;
+            fpu_memsrc16:
                 if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                      2, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                emulate_fpu_insn_memsrc("fldcw", src.val);
-                dst.type = OP_NONE;
+                emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
                 break;
             case 6: /* fnstenv - TODO */
                 state->fpu_ctrl = true;
                 goto unimplemented_insn;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
-                emulate_fpu_insn_memdst("fnstcw", dst.val);
+            fpu_memdst16:
+                dst = ea;
                 dst.bytes = 2;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             default:
                 generate_exception(EXC_UD);
             }
             /*
              * Control instructions can't raise FPU exceptions, so we need
-             * to consider suppressing writes only for non-control ones. All
-             * of them in this group have data width 4.
+             * to consider suppressing writes only for non-control ones.
              */
-            if ( dst.type == OP_MEM && dst.bytes == 4 && !fpu_check_write() )
+            if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
@@ -4411,36 +4377,7 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                 4, ctxt)) != X86EMUL_OKAY )
-                goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fiadd m32i */
-                emulate_fpu_insn_memsrc("fiaddl", src.val);
-                break;
-            case 1: /* fimul m32i */
-                emulate_fpu_insn_memsrc("fimull", src.val);
-                break;
-            case 2: /* ficom m32i */
-                emulate_fpu_insn_memsrc("ficoml", src.val);
-                break;
-            case 3: /* ficomp m32i */
-                emulate_fpu_insn_memsrc("ficompl", src.val);
-                break;
-            case 4: /* fisub m32i */
-                emulate_fpu_insn_memsrc("fisubl", src.val);
-                break;
-            case 5: /* fisubr m32i */
-                emulate_fpu_insn_memsrc("fisubrl", src.val);
-                break;
-            case 6: /* fidiv m32i */
-                emulate_fpu_insn_memsrc("fidivl", src.val);
-                break;
-            case 7: /* fidivr m32i */
-                emulate_fpu_insn_memsrc("fidivrl", src.val);
-                break;
-            }
+            goto fpu_memsrc32;
         }
         check_fpu_exn(&fic);
         break;
@@ -4470,50 +4407,35 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fild m32i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     4, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fildl", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc32;
             case 1: /* fisttp m32i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttpl", dst.val);
-                dst.bytes = 4;
-                break;
+                /* fall through */
             case 2: /* fist m32i */
-                emulate_fpu_insn_memdst("fistl", dst.val);
-                dst.bytes = 4;
-                break;
             case 3: /* fistp m32i */
-                emulate_fpu_insn_memdst("fistpl", dst.val);
-                dst.bytes = 4;
-                break;
+                goto fpu_memdst32;
             case 5: /* fld m80fp */
+            fpu_memsrc80:
                 if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
                                      10, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                emulate_fpu_insn_memsrc("fldt", *mmvalp);
-                dst.type = OP_NONE;
+                emulate_fpu_insn_memsrc(b, modrm_reg, *mmvalp);
                 break;
             case 7: /* fstp m80fp */
+            fpu_memdst80:
                 fail_if(!ops->write);
-                emulate_fpu_insn_memdst("fstpt", *mmvalp);
+                emulate_fpu_insn_memdst(b, modrm_reg, *mmvalp);
                 if ( fpu_check_write() &&
                      (rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                                       10, ctxt)) != X86EMUL_OKAY )
                     goto done;
-                dst.type = OP_NONE;
                 break;
             default:
                 generate_exception(EXC_UD);
             }
-            if ( dst.type == OP_MEM && !fpu_check_write() )
-                dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
         break;
@@ -4534,37 +4456,13 @@ x86_emulate(
             emulate_fpu_insn_stub(0xdc, modrm);
             break;
         default:
+        fpu_memsrc64:
             ASSERT(ea.type == OP_MEM);
             if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
                                  8, ctxt)) != X86EMUL_OKAY )
                 goto done;
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fadd m64fp */
-                emulate_fpu_insn_memsrc("faddl", src.val);
-                break;
-            case 1: /* fmul m64fp */
-                emulate_fpu_insn_memsrc("fmull", src.val);
-                break;
-            case 2: /* fcom m64fp */
-                emulate_fpu_insn_memsrc("fcoml", src.val);
-                break;
-            case 3: /* fcomp m64fp */
-                emulate_fpu_insn_memsrc("fcompl", src.val);
-                break;
-            case 4: /* fsub m64fp */
-                emulate_fpu_insn_memsrc("fsubl", src.val);
-                break;
-            case 5: /* fsubr m64fp */
-                emulate_fpu_insn_memsrc("fsubrl", src.val);
-                break;
-            case 6: /* fdiv m64fp */
-                emulate_fpu_insn_memsrc("fdivl", src.val);
-                break;
-            case 7: /* fdivr m64fp */
-                emulate_fpu_insn_memsrc("fdivrl", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4584,28 +4482,19 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fld m64fp */;
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     8, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fldl", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc64;
             case 1: /* fisttp m64i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttpll", dst.val);
-                dst.bytes = 8;
-                break;
+                /* fall through */
             case 2: /* fst m64fp */
-                emulate_fpu_insn_memdst("fstl", dst.val);
-                dst.bytes = 8;
-                break;
             case 3: /* fstp m64fp */
-                emulate_fpu_insn_memdst("fstpl", dst.val);
+            fpu_memdst64:
+                dst = ea;
                 dst.bytes = 8;
+                emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
                 break;
             case 4: /* frstor - TODO */
             case 6: /* fnsave - TODO */
@@ -4613,18 +4502,15 @@ x86_emulate(
                 goto unimplemented_insn;
             case 7: /* fnstsw m2byte */
                 state->fpu_ctrl = true;
-                emulate_fpu_insn_memdst("fnstsw", dst.val);
-                dst.bytes = 2;
-                break;
+                goto fpu_memdst16;
             default:
                 generate_exception(EXC_UD);
             }
             /*
              * Control instructions can't raise FPU exceptions, so we need
-             * to consider suppressing writes only for non-control ones. All
-             * of them in this group have data width 8.
+             * to consider suppressing writes only for non-control ones.
              */
-            if ( dst.type == OP_MEM && dst.bytes == 8 && !fpu_check_write() )
+            if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
@@ -4647,33 +4533,8 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            switch ( modrm_reg & 7 )
-            {
-            case 0: /* fiadd m16i */
-                emulate_fpu_insn_memsrc("fiadds", src.val);
-                break;
-            case 1: /* fimul m16i */
-                emulate_fpu_insn_memsrc("fimuls", src.val);
-                break;
-            case 2: /* ficom m16i */
-                emulate_fpu_insn_memsrc("ficoms", src.val);
-                break;
-            case 3: /* ficomp m16i */
-                emulate_fpu_insn_memsrc("ficomps", src.val);
-                break;
-            case 4: /* fisub m16i */
-                emulate_fpu_insn_memsrc("fisubs", src.val);
-                break;
-            case 5: /* fisubr m16i */
-                emulate_fpu_insn_memsrc("fisubrs", src.val);
-                break;
-            case 6: /* fidiv m16i */
-                emulate_fpu_insn_memsrc("fidivs", src.val);
-                break;
-            case 7: /* fidivr m16i */
-                emulate_fpu_insn_memsrc("fidivrs", src.val);
-                break;
-            }
+            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
+            break;
         }
         check_fpu_exn(&fic);
         break;
@@ -4689,7 +4550,7 @@ x86_emulate(
             dst.bytes = 2;
             dst.type = OP_REG;
             dst.reg = (void *)&_regs.ax;
-            emulate_fpu_insn_memdst("fnstsw", dst.val);
+            emulate_fpu_insn_memdst(b, modrm_reg, dst.val);
             break;
         case 0xe8 ... 0xef: /* fucomip %stN */
         case 0xf0 ... 0xf7: /* fcomip %stN */
@@ -4704,59 +4565,26 @@ x86_emulate(
             break;
         default:
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
-            dst = ea;
             switch ( modrm_reg & 7 )
             {
             case 0: /* fild m16i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     2, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("filds", src.val);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc16;
             case 1: /* fisttp m16i */
                 host_and_vcpu_must_have(sse3);
-                emulate_fpu_insn_memdst("fisttps", dst.val);
-                dst.bytes = 2;
-                break;
+                /* fall through */
             case 2: /* fist m16i */
-                emulate_fpu_insn_memdst("fists", dst.val);
-                dst.bytes = 2;
-                break;
             case 3: /* fistp m16i */
-                emulate_fpu_insn_memdst("fistps", dst.val);
-                dst.bytes = 2;
-                break;
+                goto fpu_memdst16;
             case 4: /* fbld m80dec */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
-                                     10, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fbld", *mmvalp);
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc80;
             case 5: /* fild m64i */
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     8, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc("fildll", src.val);
                 dst.type = OP_NONE;
-                break;
+                goto fpu_memsrc64;
             case 6: /* fbstp packed bcd */
-                fail_if(!ops->write);
-                emulate_fpu_insn_memdst("fbstp", *mmvalp);
-                if ( fpu_check_write() &&
-                     (rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
-                                      10, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                dst.type = OP_NONE;
-                break;
+                goto fpu_memdst80;
             case 7: /* fistp m64i */
-                emulate_fpu_insn_memdst("fistpll", dst.val);
-                dst.bytes = 8;
-                break;
+                goto fpu_memdst64;
             }
-            if ( dst.type == OP_MEM && !fpu_check_write() )
-                dst.type = OP_NONE;
         }
         check_fpu_exn(&fic);
         break;



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 15/25] x86/HVM: eliminate custom #MF/#XM handling
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (13 preceding siblings ...)
  2017-12-07 14:09 ` [PATCH v3 14/25] x86emul: make all FPU emulation use the stub Jan Beulich
@ 2017-12-07 14:10 ` Jan Beulich
  2018-02-02 13:38   ` Andrew Cooper
  2017-12-07 14:11 ` [PATCH v3 16/25] x86emul: support SWAPGS Jan Beulich
                   ` (9 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:10 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Use the generic stub exception handling instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
---
v3: Re-base.
v2: Re-base.
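
For illustration only (not part of the patch; the plain numbers stand in
for Xen's EXC_* and X86_CR4_OSXMMEXCPT macros), the classification done
by the new exception_from_stub() helper boils down to:

  /* Sketch: map a fault taken inside an emulation stub back to an
   * emulator-visible exception, assuming CR4 could be read (the real
   * code falls back to assuming OSXMMEXCPT is set if it can't). */
  enum outcome { RAISE_MF, RAISE_XM, RAISE_UD, CRASH_DOMAIN };

  static enum outcome classify_stub_fault(unsigned int trapnr,
                                          unsigned long cr4)
  {
      if ( trapnr == 16 )              /* #MF: x87 exception */
          return RAISE_MF;
      if ( trapnr == 19 )              /* #XM: SIMD exception */
          return (cr4 & (1UL << 10))   /* CR4.OSXMMEXCPT */
                 ? RAISE_XM : RAISE_UD;
      if ( trapnr == 6 )               /* #UD raised by the stub insn */
          return RAISE_UD;
      return CRASH_DOMAIN;             /* anything else is unexpected */
  }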

--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -134,8 +134,6 @@ int emul_test_read_xcr(
 }
 
 int emul_test_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt)
 {
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -226,8 +226,6 @@ int emul_test_read_xcr(
     struct x86_emulate_ctxt *ctxt);
 
 int emul_test_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt);
 
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1909,8 +1909,6 @@ int hvmemul_cpuid(uint32_t leaf, uint32_
 }
 
 static int hvmemul_get_fpu(
-    void (*exception_callback)(void *, struct cpu_user_regs *),
-    void *exception_callback_arg,
     enum x86_emulate_fpu_type type,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -1948,9 +1946,6 @@ static int hvmemul_get_fpu(
         }
     }
 
-    curr->arch.hvm_vcpu.fpu_exception_callback = exception_callback;
-    curr->arch.hvm_vcpu.fpu_exception_callback_arg = exception_callback_arg;
-
     return X86EMUL_OKAY;
 }
 
@@ -1961,8 +1956,6 @@ static void hvmemul_put_fpu(
 {
     struct vcpu *curr = current;
 
-    curr->arch.hvm_vcpu.fpu_exception_callback = NULL;
-
     if ( aux )
     {
         typeof(curr->arch.xsave_area->fpu_sse) *fpu_ctxt = curr->arch.fpu_ctxt;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -704,7 +704,6 @@ void do_reserved_trap(struct cpu_user_re
 
 void do_trap(struct cpu_user_regs *regs)
 {
-    struct vcpu *curr = current;
     unsigned int trapnr = regs->entry_vector;
     unsigned long fixup;
 
@@ -724,15 +723,6 @@ void do_trap(struct cpu_user_regs *regs)
         return;
     }
 
-    if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
-         system_state >= SYS_STATE_active && is_hvm_vcpu(curr) &&
-         curr->arch.hvm_vcpu.fpu_exception_callback )
-    {
-        curr->arch.hvm_vcpu.fpu_exception_callback(
-            curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
-        return;
-    }
-
     if ( likely((fixup = search_exception_table(regs)) != 0) )
     {
         dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n",
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -959,6 +959,33 @@ static inline int mkec(uint8_t e, int32_
 #define generate_exception(e, ec...) generate_exception_if(true, e, ##ec)
 
 #ifdef __XEN__
+static int exception_from_stub(union stub_exception_token res,
+                               void *stub, unsigned int line,
+                               struct x86_emulate_ctxt *ctxt,
+                               const struct x86_emulate_ops *ops)
+{
+    int rc = X86EMUL_UNHANDLEABLE;
+
+    generate_exception_if(res.fields.trapnr == EXC_MF, EXC_MF);
+    if ( res.fields.trapnr == EXC_XM )
+    {
+        unsigned long cr4;
+
+        if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+            cr4 = X86_CR4_OSXMMEXCPT;
+        generate_exception(cr4 & X86_CR4_OSXMMEXCPT ? EXC_XM : EXC_UD);
+    }
+    gprintk(XENLOG_WARNING,
+            "exception %u (ec=%04x) in emulation stub (line %u)\n",
+            res.fields.trapnr, res.fields.ec, line);
+    gprintk(XENLOG_INFO, "stub: %"__stringify(MAX_INST_LEN)"ph\n", stub);
+    generate_exception_if(res.fields.trapnr == EXC_UD, EXC_UD);
+    domain_crash(current->domain);
+
+ done:
+    return rc;
+}
+
 # define invoke_stub(pre, post, constraints...) do {                    \
     union stub_exception_token res_ = { .raw = ~0 };                    \
     asm volatile ( pre "\n\tcall *%[stub]\n\t" post "\n"                \
@@ -974,14 +1001,7 @@ static inline int mkec(uint8_t e, int32_
                      "m" (*(uint8_t(*)[MAX_INST_LEN + 1])stub.ptr) );   \
     if ( unlikely(~res_.raw) )                                          \
     {                                                                   \
-        gprintk(XENLOG_WARNING,                                         \
-                "exception %u (ec=%04x) in emulation stub (line %u)\n", \
-                res_.fields.trapnr, res_.fields.ec, __LINE__);          \
-        gprintk(XENLOG_INFO, "stub: %"__stringify(MAX_INST_LEN)"ph\n",  \
-                stub.func);                                             \
-        generate_exception_if(res_.fields.trapnr == EXC_UD, EXC_UD);    \
-        domain_crash(current->domain);                                  \
-        rc = X86EMUL_UNHANDLEABLE;                                      \
+        rc = exception_from_stub(res_, stub.func, __LINE__, ctxt, ops); \
         goto done;                                                      \
     }                                                                   \
 } while (0)
@@ -1098,23 +1118,8 @@ do {
     ops->write_segment(x86_seg_cs, cs, ctxt);                           \
 })
 
-struct fpu_insn_ctxt {
-    uint8_t insn_bytes;
-    uint8_t type;
-    int8_t exn_raised;
-};
-
-static void fpu_handle_exception(void *_fic, struct cpu_user_regs *regs)
-{
-    struct fpu_insn_ctxt *fic = _fic;
-    ASSERT(regs->entry_vector < 0x20);
-    fic->exn_raised = regs->entry_vector;
-    regs->r(ip) += fic->insn_bytes;
-}
-
 static int _get_fpu(
     enum x86_emulate_fpu_type type,
-    struct fpu_insn_ctxt *fic,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
@@ -1139,14 +1144,13 @@ static int _get_fpu(
         break;
     }
 
-    rc = ops->get_fpu(fpu_handle_exception, fic, type, ctxt);
+    rc = ops->get_fpu(type, ctxt);
 
     if ( rc == X86EMUL_OKAY )
     {
         unsigned long cr0;
 
         fail_if(type == X86EMUL_FPU_fpu && !ops->put_fpu);
-        fic->type = type;
 
         fail_if(!ops->read_cr);
         if ( type >= X86EMUL_FPU_xmm )
@@ -1184,37 +1188,22 @@ static int _get_fpu(
     return rc;
 }
 
-#define get_fpu(_type, _fic)                                    \
+#define get_fpu(type)                                           \
 do {                                                            \
-    rc = _get_fpu(_type, _fic, ctxt, ops);                      \
+    rc = _get_fpu(fpu_type = (type), ctxt, ops);                \
     if ( rc ) goto done;                                        \
 } while (0)
 
-#define check_fpu_exn(fic)                                      \
-do {                                                            \
-    generate_exception_if((fic)->exn_raised >= 0,               \
-                          (fic)->exn_raised);                   \
-} while (0)
-
-#define check_xmm_exn(fic)                                      \
-do {                                                            \
-    if ( (fic)->exn_raised == EXC_XM && ops->read_cr &&         \
-         ops->read_cr(4, &cr4, ctxt) == X86EMUL_OKAY &&         \
-         !(cr4 & X86_CR4_OSXMMEXCPT) )                          \
-        (fic)->exn_raised = EXC_UD;                             \
-    check_fpu_exn(fic);                                         \
-} while (0)
-
 static void put_fpu(
-    struct fpu_insn_ctxt *fic,
+    enum x86_emulate_fpu_type type,
     bool failed_late,
     const struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
     const struct x86_emulate_ops *ops)
 {
-    if ( unlikely(failed_late) && fic->type == X86EMUL_FPU_fpu )
+    if ( unlikely(failed_late) && type == X86EMUL_FPU_fpu )
         ops->put_fpu(ctxt, X86EMUL_FPU_fpu, NULL);
-    else if ( unlikely(fic->type == X86EMUL_FPU_fpu) && !state->fpu_ctrl )
+    else if ( unlikely(type == X86EMUL_FPU_fpu) && !state->fpu_ctrl )
     {
         struct x86_emul_fpu_aux aux = {
             .ip = ctxt->regs->r(ip),
@@ -1248,9 +1237,8 @@ static void put_fpu(
         }
         ops->put_fpu(ctxt, X86EMUL_FPU_none, &aux);
     }
-    else if ( fic->type != X86EMUL_FPU_none && ops->put_fpu )
+    else if ( type != X86EMUL_FPU_none && ops->put_fpu )
         ops->put_fpu(ctxt, X86EMUL_FPU_none, NULL);
-    fic->type = X86EMUL_FPU_none;
 }
 
 static inline bool fpu_check_write(void)
@@ -1265,29 +1253,27 @@ static inline bool fpu_check_write(void)
 #define emulate_fpu_insn_memdst(opc, ext, arg)                          \
 do {                                                                    \
     /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
-    fic.insn_bytes = 2;                                                 \
+    insn_bytes = 2;                                                     \
     memcpy(get_stub(stub),                                              \
            ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
-    invoke_stub("", "", "+m" (fic), "+m" (arg) : "a" (&(arg)));         \
+    invoke_stub("", "", "+m" (arg) : "a" (&(arg)));                     \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_memsrc(opc, ext, arg)                          \
 do {                                                                    \
     /* ModRM: mod=0, reg=ext, rm=0, i.e. a (%rax) operand */            \
-    fic.insn_bytes = 2;                                                 \
     memcpy(get_stub(stub),                                              \
            ((uint8_t[]){ opc, ((ext) & 7) << 3, 0xc3 }), 3);            \
-    invoke_stub("", "", "+m" (fic) : "m" (arg), "a" (&(arg)));          \
+    invoke_stub("", "", "=m" (dummy) : "m" (arg), "a" (&(arg)));        \
     put_stub(stub);                                                     \
 } while (0)
 
 #define emulate_fpu_insn_stub(bytes...)                                 \
 do {                                                                    \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
-    fic.insn_bytes = nr_;                                               \
     memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
-    invoke_stub("", "", "=m" (fic) : "m" (fic));                        \
+    invoke_stub("", "", "=m" (dummy) : "i" (0));                        \
     put_stub(stub);                                                     \
 } while (0)
 
@@ -1295,12 +1281,10 @@ do {
 do {                                                                    \
     unsigned int nr_ = sizeof((uint8_t[]){ bytes });                    \
     unsigned long tmp_;                                                 \
-    fic.insn_bytes = nr_;                                               \
     memcpy(get_stub(stub), ((uint8_t[]){ bytes, 0xc3 }), nr_ + 1);      \
     invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),             \
                 _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),            \
-                [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_),       \
-                "+m" (fic)                                              \
+                [eflags] "+g" (_regs.eflags), [tmp] "=&r" (tmp_)        \
                 : [mask] "i" (X86_EFLAGS_ZF|X86_EFLAGS_PF|X86_EFLAGS_CF)); \
     put_stub(stub);                                                     \
 } while (0)
@@ -3143,14 +3127,14 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0;
+    unsigned int first_byte = 0, insn_bytes = 0;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
     bool sfence = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4;
-    struct fpu_insn_ctxt fic = { .type = X86EMUL_FPU_none, .exn_raised = -1 };
+    enum x86_emulate_fpu_type fpu_type = X86EMUL_FPU_none;
     struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
 
@@ -3842,9 +3826,8 @@ x86_emulate(
 
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_wait, &fic);
+        get_fpu(X86EMUL_FPU_wait);
         emulate_fpu_insn_stub(b);
-        check_fpu_exn(&fic);
         break;
 
     case 0x9c: /* pushf */
@@ -4248,7 +4231,7 @@ x86_emulate(
 
     case 0xd8: /* FPU 0xd8 */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fadd %stN,%st */
@@ -4270,12 +4253,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xd9: /* FPU 0xd9 */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xfb: /* fsincos */
@@ -4357,12 +4339,11 @@ x86_emulate(
             if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xda: /* FPU 0xda */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fcmovb %stN */
@@ -4379,12 +4360,11 @@ x86_emulate(
             generate_exception_if(ea.type != OP_MEM, EXC_UD);
             goto fpu_memsrc32;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdb: /* FPU 0xdb */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fcmovnb %stN */
@@ -4437,12 +4417,11 @@ x86_emulate(
                 generate_exception(EXC_UD);
             }
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdc: /* FPU 0xdc */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* fadd %st,%stN */
@@ -4464,12 +4443,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdd: /* FPU 0xdd */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* ffree %stN */
@@ -4513,12 +4491,11 @@ x86_emulate(
             if ( dst.type == OP_MEM && !state->fpu_ctrl && !fpu_check_write() )
                 dst.type = OP_NONE;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xde: /* FPU 0xde */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xc0 ... 0xc7: /* faddp %stN */
@@ -4536,12 +4513,11 @@ x86_emulate(
             emulate_fpu_insn_memsrc(b, modrm_reg, src.val);
             break;
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xdf: /* FPU 0xdf */
         host_and_vcpu_must_have(fpu);
-        get_fpu(X86EMUL_FPU_fpu, &fic);
+        get_fpu(X86EMUL_FPU_fpu);
         switch ( modrm )
         {
         case 0xe0:
@@ -4586,7 +4562,6 @@ x86_emulate(
                 goto fpu_memdst64;
             }
         }
-        check_fpu_exn(&fic);
         break;
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
@@ -5418,7 +5393,7 @@ x86_emulate(
         else
             generate_exception(EXC_UD);
 
-        get_fpu(X86EMUL_FPU_mmx, &fic);
+        get_fpu(X86EMUL_FPU_mmx);
 
         d = DstReg | SrcMem;
         op_bytes = 8;
@@ -5508,7 +5483,7 @@ x86_emulate(
             else
                 vcpu_must_have(sse);
     simd_0f_xmm:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
@@ -5518,7 +5493,7 @@ x86_emulate(
     simd_0f_avx:
             host_and_vcpu_must_have(avx);
     simd_0f_ymm:
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
     simd_0f_common:
         opc = init_prefixes(stub);
@@ -5531,7 +5506,7 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         break;
 
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
@@ -5618,12 +5593,12 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         if ( ea.type == OP_MEM )
@@ -5649,14 +5624,14 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             vex.l = 0;
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -5679,17 +5654,14 @@ x86_emulate(
             opc[1] = modrm & 0xc7;
         if ( !mode_64bit() )
             vex.w = 0;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         ea.reg = decode_register(modrm_reg, &_regs, 0);
-        invoke_stub("", "", "=a" (*ea.reg), "+m" (fic.exn_raised)
-                            : "c" (mmvalp), "m" (*mmvalp));
+        invoke_stub("", "", "=a" (*ea.reg) : "c" (mmvalp), "m" (*mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         state->simd_size = simd_none;
         break;
 
@@ -5703,13 +5675,13 @@ x86_emulate(
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -5727,20 +5699,17 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     [eflags] "+g" (_regs.eflags),
-                    [tmp] "=&r" (dummy), "+m" (*mmvalp),
-                    "+m" (fic.exn_raised)
+                    [tmp] "=&r" (dummy), "+m" (*mmvalp)
                     : "a" (mmvalp), [mask] "i" (EFLAGS_MASK));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -5878,9 +5847,9 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0xc7;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
     simd_0f_to_gpr:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
 
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
@@ -5899,9 +5868,9 @@ x86_emulate(
                     vcpu_must_have(sse);
             }
             if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
-                get_fpu(X86EMUL_FPU_xmm, &fic);
+                get_fpu(X86EMUL_FPU_xmm);
             else
-                get_fpu(X86EMUL_FPU_mmx, &fic);
+                get_fpu(X86EMUL_FPU_mmx);
         }
         else
         {
@@ -5910,14 +5879,13 @@ x86_emulate(
                 host_and_vcpu_must_have(avx);
             else
                 host_and_vcpu_must_have(avx2);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         ASSERT(!state->simd_size);
         dst.bytes = 4;
@@ -6083,7 +6051,7 @@ x86_emulate(
             goto simd_0f_sse2;
     simd_0f_mmx:
         host_and_vcpu_must_have(mmx);
-        get_fpu(X86EMUL_FPU_mmx, &fic);
+        get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
@@ -6094,17 +6062,17 @@ x86_emulate(
         {
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
     simd_0f_rm:
@@ -6116,17 +6084,14 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
-        invoke_stub("", "", "+m" (src.val), "+m" (fic.exn_raised)
-                            : "a" (&src.val));
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
         dst.val = src.val;
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -6192,19 +6157,19 @@ x86_emulate(
                 host_and_vcpu_must_have(avx);
             }
     simd_0f_imm8_ymm:
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
     simd_0f_imm8_sse2:
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
             vcpu_must_have(mmxext);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
     simd_0f_imm8:
         opc = init_prefixes(stub);
@@ -6218,7 +6183,7 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         break;
 
     CASE_SIMD_PACKED_INT(0x0f, 0x71):    /* Grp12 */
@@ -6246,33 +6211,31 @@ x86_emulate(
                 host_and_vcpu_must_have(avx2);
             else
                 host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
     simd_0f_reg_only:
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", [dummy_out] "=g" (dummy) : [dummy_in] "i" (0) );
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         ASSERT(!state->simd_size);
         break;
 
@@ -6307,7 +6270,7 @@ x86_emulate(
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
 
 #ifdef __x86_64__
             if ( !mode_64bit() )
@@ -6349,12 +6312,12 @@ x86_emulate(
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         opc = init_prefixes(stub);
         opc[0] = b;
-        fic.insn_bytes = PFX_BYTES + 1;
+        insn_bytes = PFX_BYTES + 1;
         goto simd_0f_reg_only;
 
     case X86EMUL_OPC_66(0x0f, 0x78):     /* Grp17 */
@@ -6370,14 +6333,14 @@ x86_emulate(
         generate_exception_if(ea.type != OP_REG, EXC_UD);
 
         host_and_vcpu_must_have(sse4a);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
 
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         opc[2] = imm1;
         opc[3] = imm2;
-        fic.insn_bytes = PFX_BYTES + 4;
+        insn_bytes = PFX_BYTES + 4;
         goto simd_0f_reg_only;
 
     case X86EMUL_OPC_66(0x0f, 0x79):     /* extrq xmm,xmm */
@@ -6505,7 +6468,7 @@ x86_emulate(
             vcpu_must_have(sse);
         ldmxcsr:
             generate_exception_if(src.type != OP_MEM, EXC_UD);
-            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm, &fic);
+            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             generate_exception_if(src.val & ~mxcsr_mask, EXC_GP, 0);
             asm volatile ( "ldmxcsr %0" :: "m" (src.val) );
             break;
@@ -6515,7 +6478,7 @@ x86_emulate(
             vcpu_must_have(sse);
         stmxcsr:
             generate_exception_if(dst.type != OP_MEM, EXC_UD);
-            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm, &fic);
+            get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             asm volatile ( "stmxcsr %0" : "=m" (dst.val) );
             break;
 
@@ -6769,7 +6732,7 @@ x86_emulate(
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
                 goto simd_0f_imm8_sse2;
             vcpu_must_have(sse);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
             goto simd_0f_imm8;
         }
         goto simd_0f_imm8_avx;
@@ -6800,7 +6763,7 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0xc7;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
@@ -7048,18 +7011,18 @@ x86_emulate(
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             d |= TwoOp;
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
         else if ( vex.pfx )
         {
             vcpu_must_have(sse2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
             vcpu_must_have(mmxext);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
 
         /*
@@ -7079,7 +7042,6 @@ x86_emulate(
         if ( !mode_64bit() )
             vex.w = 0;
         opc[1] = modrm & 0xc7;
-        fic.insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
 
         copy_REX_VEX(opc, rex_prefix, vex);
@@ -7092,6 +7054,7 @@ x86_emulate(
         opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
         /* Restore high bit of XMM destination. */
         if ( sfence )
         {
@@ -7138,12 +7101,12 @@ x86_emulate(
         if ( vex.pfx )
         {
     simd_0f38_common:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
         opc = init_prefixes(stub);
         opc[0] = 0x38;
@@ -7156,7 +7119,7 @@ x86_emulate(
             vex.b = 1;
             opc[2] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         break;
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,ymm */
@@ -7184,13 +7147,13 @@ x86_emulate(
         if ( vex.opcx == vex_none )
         {
             host_and_vcpu_must_have(sse4_1);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -7209,21 +7172,19 @@ x86_emulate(
             vex.b = 1;
             opc[1] &= 0x38;
         }
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
         opc[2] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
+            ++insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         emulate_stub("+m" (*mmvalp), "a" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
-
         state->simd_size = simd_none;
         dst.type = OP_NONE;
         break;
@@ -7312,7 +7273,7 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
         host_and_vcpu_must_have(avx);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /*
          * While we can't reasonably provide fully correct behavior here
@@ -7361,7 +7322,7 @@ x86_emulate(
         rex_prefix &= ~REX_B;
         vex.b = 1;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
 
         break;
     }
@@ -7410,7 +7371,7 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         host_and_vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /*
          * While we can't reasonably provide fully correct behavior here
@@ -7457,7 +7418,7 @@ x86_emulate(
         rex_prefix &= ~REX_B;
         vex.b = 1;
         opc[1] = modrm & 0x38;
-        fic.insn_bytes = PFX_BYTES + 2;
+        insn_bytes = PFX_BYTES + 2;
 
         break;
     }
@@ -7480,7 +7441,7 @@ x86_emulate(
                               state->sib_index == mask_reg, EXC_UD);
         generate_exception_if(!cpu_has_avx, EXC_UD);
         vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
 
         /* Read destination, index, and mask registers. */
         opc = init_prefixes(stub);
@@ -7817,12 +7778,12 @@ x86_emulate(
         if ( vex.pfx )
         {
     simd_0f3a_common:
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             host_and_vcpu_must_have(mmx);
-            get_fpu(X86EMUL_FPU_mmx, &fic);
+            get_fpu(X86EMUL_FPU_mmx);
         }
         opc = init_prefixes(stub);
         opc[0] = 0x3a;
@@ -7836,7 +7797,7 @@ x86_emulate(
             opc[2] &= 0x38;
         }
         opc[3] = imm1;
-        fic.insn_bytes = PFX_BYTES + 4;
+        insn_bytes = PFX_BYTES + 4;
         break;
 
     case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
@@ -7844,7 +7805,7 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
     case X86EMUL_OPC_66(0x0f3a, 0x17): /* extractps $imm8,xmm,r/m */
         host_and_vcpu_must_have(sse4_1);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
 
         opc = init_prefixes(stub);
         opc++[0] = 0x3a;
@@ -7857,20 +7818,16 @@ x86_emulate(
             vex.w = 0;
         opc[1] = modrm & 0x38;
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
-
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         ASSERT(!state->simd_size);
         dst.bytes = dst.type == OP_REG || b == 0x17 ? 4 : 1 << (b & 3);
@@ -7884,7 +7841,7 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
         host_and_vcpu_must_have(avx);
-        get_fpu(X86EMUL_FPU_ymm, &fic);
+        get_fpu(X86EMUL_FPU_ymm);
         opc = init_prefixes(stub);
         goto pextr;
 
@@ -7906,17 +7863,15 @@ x86_emulate(
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
 
         copy_VEX(opc, vex);
         /* Latch MXCSR - we may need to restore it below. */
         invoke_stub("stmxcsr %[mxcsr]", "",
-                    "=m" (*mmvalp), "+m" (fic.exn_raised), [mxcsr] "=m" (mxcsr)
-                    : "a" (mmvalp));
+                    "=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
 
         if ( ea.type == OP_MEM )
         {
@@ -7935,7 +7890,7 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);
-        get_fpu(X86EMUL_FPU_xmm, &fic);
+        get_fpu(X86EMUL_FPU_xmm);
         memcpy(mmvalp, &src.val, op_bytes);
         ea.type = OP_MEM;
         op_bytes = src.bytes;
@@ -8045,13 +8000,13 @@ x86_emulate(
         if ( vex.opcx == vex_none )
         {
             host_and_vcpu_must_have(sse4_2);
-            get_fpu(X86EMUL_FPU_xmm, &fic);
+            get_fpu(X86EMUL_FPU_xmm);
         }
         else
         {
             generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
             host_and_vcpu_must_have(avx);
-            get_fpu(X86EMUL_FPU_ymm, &fic);
+            get_fpu(X86EMUL_FPU_ymm);
         }
 
         opc = init_prefixes(stub);
@@ -8072,13 +8027,13 @@ x86_emulate(
                 goto done;
         }
         opc[2] = imm1;
-        fic.insn_bytes = PFX_BYTES + 3;
+        insn_bytes = PFX_BYTES + 3;
         opc[3] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             /* Cover for extra prefix byte. */
             --opc;
-            ++fic.insn_bytes;
+            ++insn_bytes;
         }
 
         copy_REX_VEX(opc, rex_prefix, vex);
@@ -8309,7 +8264,7 @@ x86_emulate(
 
         if ( !opc )
             BUG();
-        opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+        opc[insn_bytes - PFX_BYTES] = 0xc3;
         copy_REX_VEX(opc, rex_prefix, vex);
 
         if ( ea.type == OP_MEM )
@@ -8387,13 +8342,11 @@ x86_emulate(
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
                                       X86EMUL_OPC_ENCODING_MASK)) !=
                     X86EMUL_OPC(0x0f, 0xf7)) )
-            invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
-                                : "a" (mmvalp));
+            invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
         else
             invoke_stub("", "", "+m" (*mmvalp) : "D" (mmvalp));
 
         put_stub(stub);
-        check_xmm_exn(&fic);
     }
 
     switch ( dst.type )
@@ -8436,7 +8389,8 @@ x86_emulate(
     }
 
  complete_insn: /* Commit shadow register state. */
-    put_fpu(&fic, false, state, ctxt, ops);
+    put_fpu(fpu_type, false, state, ctxt, ops);
+    fpu_type = X86EMUL_FPU_none;
 
     /* Zero the upper 32 bits of %rip if not in 64-bit mode. */
     if ( !mode_64bit() )
@@ -8460,7 +8414,7 @@ x86_emulate(
     ctxt->regs->eflags &= ~X86_EFLAGS_RF;
 
  done:
-    put_fpu(&fic, fic.insn_bytes > 0 && dst.type == OP_MEM, state, ctxt, ops);
+    put_fpu(fpu_type, insn_bytes > 0 && dst.type == OP_MEM, state, ctxt, ops);
     put_stub(stub);
     return rc;
 #undef state
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -438,12 +438,8 @@ struct x86_emulate_ops
 
     /*
      * get_fpu: Load emulated environment's FPU state onto processor.
-     *  @exn_callback: On any FPU or SIMD exception, pass control to
-     *                 (*exception_callback)(exception_callback_arg, regs).
      */
     int (*get_fpu)(
-        void (*exception_callback)(void *, struct cpu_user_regs *),
-        void *exception_callback_arg,
         enum x86_emulate_fpu_type type,
         struct x86_emulate_ctxt *ctxt);
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -198,10 +198,6 @@ struct hvm_vcpu {
 
     struct hvm_vcpu_io  hvm_io;
 
-    /* Callback into x86_emulate when emulating FPU/MMX/XMM instructions. */
-    void (*fpu_exception_callback)(void *, struct cpu_user_regs *);
-    void *fpu_exception_callback_arg;
-
     /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
     struct x86_event     inject_event;
 



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 16/25] x86emul: support SWAPGS
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (14 preceding siblings ...)
  2017-12-07 14:10 ` [PATCH v3 15/25] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
@ 2017-12-07 14:11 ` Jan Beulich
  2018-02-02 13:41   ` Andrew Cooper
  2017-12-07 14:11 ` [PATCH v3 17/25] x86emul: emulate {MONITOR,MWAIT}{,X} as no-op Jan Beulich
                   ` (8 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:11 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5047,6 +5047,24 @@ x86_emulate(
                 goto done;
             break;
 
+        case 0xf8: /* swapgs */
+            generate_exception_if(!mode_64bit(), EXC_UD);
+            generate_exception_if(!mode_ring0(), EXC_GP, 0);
+            fail_if(!ops->read_segment || !ops->read_msr ||
+                    !ops->write_segment || !ops->write_msr);
+            if ( (rc = ops->read_segment(x86_seg_gs, &sreg,
+                                         ctxt)) != X86EMUL_OKAY ||
+                 (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
+                                     ctxt)) != X86EMUL_OKAY ||
+                 (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
+                                      ctxt)) != X86EMUL_OKAY )
+                goto done;
+            sreg.base = msr_val;
+            if ( (rc = ops->write_segment(x86_seg_gs, &sreg,
+                                          ctxt)) != X86EMUL_OKAY )
+                goto done;
+            break;
+
         case 0xf9: /* rdtscp */
             fail_if(ops->read_msr == NULL);
             if ( (rc = ops->read_msr(MSR_TSC_AUX,




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 17/25] x86emul: emulate {MONITOR,MWAIT}{,X} as no-op
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (15 preceding siblings ...)
  2017-12-07 14:11 ` [PATCH v3 16/25] x86emul: support SWAPGS Jan Beulich
@ 2017-12-07 14:11 ` Jan Beulich
  2018-02-02 14:05   ` Andrew Cooper
  2017-12-07 14:12 ` [PATCH v3 18/25] x86emul: add missing suffixes in test harness Jan Beulich
                   ` (7 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:11 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

As mentioned in Linux commit 87c00572ba ("kvm: x86: emulate monitor and
mwait instructions as nop"), older OS X versions (for example) may make
use of the insns without checking CPUID flags (presumably implying
availability from family/model).

While the instruction prefix check appears to contradict the SDM,
Intel's XED project has the insns marked "no_refining_prefix", which
also matches up with neighboring insns in this group. The latter fact
is also what explains why MONITORX/MWAITX do _not_ have such a check.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
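
Purely to illustrate the kind of guest code this is about (hypothetical
example of mine, not taken from any OS), a wait loop like the one below
now just degenerates to polling instead of taking #UD when emulated:

  /* Guest-side wait loop using MONITOR/MWAIT without a CPUID check.
   * MONITOR takes the address in rAX (ECX/EDX are hints), MWAIT takes
   * hints in EAX/ECX; with both emulated as no-ops, MWAIT returns
   * immediately, which is functionally harmless here. */
  static inline void wait_for_flag(const volatile unsigned int *flag)
  {
      while ( !*flag )
      {
          asm volatile ( "monitor" :: "a" (flag), "c" (0), "d" (0) );
          if ( *flag )
              break;
          asm volatile ( "mwait" :: "a" (0), "c" (0) );
      }
  }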

--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -89,9 +89,6 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
-/* Bitfield of MSR_K8_HWCR register */
-#define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
-
 /*Intel Specific bitfield*/
 #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
 #define MCi_MISC_PHYSMOD    (0x2UL << 6)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1712,6 +1712,7 @@ static bool vcpu_has(
 #define vcpu_has_xop()         vcpu_has(0x80000001, X, ECX, 12, ctxt, ops)
 #define vcpu_has_fma4()        vcpu_has(0x80000001, X, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, X, ECX, 21, ctxt, ops)
+#define vcpu_has_monitorx()    vcpu_has(0x80000001, X, ECX, 29, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, 0, EBX,  3, ctxt, ops)
 #define vcpu_has_hle()         vcpu_has(         7, 0, EBX,  4, ctxt, ops)
 #define vcpu_has_avx2()        vcpu_has(         7, 0, EBX,  5, ctxt, ops)
@@ -4979,6 +4980,43 @@ x86_emulate(
 
         switch( modrm )
         {
+        case 0xc8: /* monitor */
+        case 0xc9: /* mwait */
+        {
+            bool user = false;
+
+            generate_exception_if(vex.pfx, EXC_UD);
+            /* There is intentionally no feature flag check here. */
+            if ( ops->read_msr )
+            {
+                switch ( ctxt->vendor )
+                {
+                case X86_VENDOR_AMD:
+                    if ( ops->read_msr(MSR_K8_HWCR, &msr_val,
+                                       ctxt) == X86EMUL_OKAY &&
+                         (msr_val & K8_HWCR_MON_MWAIT_USER_EN) )
+                        user = true;
+                    break;
+
+                case X86_VENDOR_INTEL:
+                    if ( ops->read_msr(MSR_INTEL_MISC_FEATURES_ENABLES,
+                                       &msr_val, ctxt) == X86EMUL_OKAY &&
+                         (msr_val & MSR_MISC_FEATURES_RING3MWAIT) )
+                        user = true;
+                    break;
+                }
+            }
+            generate_exception_if(!user && !mode_ring0(), EXC_UD);
+            if ( modrm & 1 )
+                break;
+        monitor:
+            /* Zero-byte read to perform segmentation checks. */
+            rc = ops->read(ea.mem.seg, truncate_ea(_regs.r(ax)), NULL, 0, ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
+            break;
+        }
+
         case 0xca: /* clac */
         case 0xcb: /* stac */
             vcpu_must_have(smap);
@@ -5073,6 +5111,14 @@ x86_emulate(
             _regs.r(cx) = (uint32_t)msr_val;
             goto rdtsc;
 
+        case 0xfa: /* monitorx */
+            vcpu_must_have(monitorx);
+            goto monitor;
+
+        case 0xfb: /* mwaitx */
+            vcpu_must_have(monitorx);
+            break;
+
         case 0xfc: /* clzero */
         {
             unsigned long zero = 0;
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -161,6 +161,9 @@
 
 #define MSR_K7_HWCR			0xc0010015
 #define MSR_K8_HWCR			0xc0010015
+#define K8_HWCR_MON_MWAIT_USER_EN	(1ULL << 10)
+#define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
+
 #define MSR_K7_FID_VID_CTL		0xc0010041
 #define MSR_K7_FID_VID_STATUS		0xc0010042
 #define MSR_K8_PSTATE_LIMIT		0xc0010061
@@ -516,6 +519,8 @@
 #define MSR_INTEL_MISC_FEATURES_ENABLES	0x00000140
 #define _MSR_MISC_FEATURES_CPUID_FAULTING	0
 #define MSR_MISC_FEATURES_CPUID_FAULTING	(1ULL << _MSR_MISC_FEATURES_CPUID_FAULTING)
+#define _MSR_MISC_FEATURES_RING3MWAIT		1
+#define MSR_MISC_FEATURES_RING3MWAIT		(1ULL << _MSR_MISC_FEATURES_RING3MWAIT)
 
 #define MSR_CC6_DEMOTION_POLICY_CONFIG	0x00000668
 #define MSR_MC6_DEMOTION_POLICY_CONFIG	0x00000669



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 18/25] x86emul: add missing suffixes in test harness
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (16 preceding siblings ...)
  2017-12-07 14:11 ` [PATCH v3 17/25] x86emul: emulate {MONITOR,MWAIT}{,X} as no-op Jan Beulich
@ 2017-12-07 14:12 ` Jan Beulich
  2018-02-02 14:13   ` Andrew Cooper
  2017-12-07 14:14 ` [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect Jan Beulich
                   ` (6 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:12 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

I'm in the process of putting together a gas change that will issue at
least a warning when the intended size of a memory operation can't be
deduced from another (register) operand. Add the missing suffixes now
to silence such future diagnostics.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
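
To illustrate the ambiguity (example of mine, not part of the patch):
with only an immediate and a memory operand there is no register from
which gas could infer the access width, so it has to pick a default
width on its own, and the change mentioned above would then at least
warn:

  /* Ambiguous: nothing tells gas whether this AND is 8, 16 or 32 bits
   * wide; 0xffffffee is ~0x11, i.e. clear CF and AF in a flags image. */
  static inline void clear_cf_af_ambiguous(unsigned int *flags)
  {
      asm volatile ( "and $0xffffffee, %0" : "+m" (*flags) :: "cc" );
  }

  /* Explicitly sized, matching what the harness is switched to use. */
  static inline void clear_cf_af(unsigned int *flags)
  {
      asm volatile ( "andl $0xffffffee, %0" : "+m" (*flags) :: "cc" );
  }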

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -785,13 +785,13 @@ int main(int argc, char **argv)
         regs.eflags |= (i & 0x100) ? X86_EFLAGS_AF : 0;
         if ( i & 0x400 )
             __asm__ (
-                "pushf; and $0xffffffee,(%%esp); or %1,(%%esp); popf; das; "
+                "pushf; andl $~0x11,(%%esp); or %1,(%%esp); popf; das; "
                 "pushf; popl %1"
                 : "=a" (bcdres_native), "=r" (regs.eflags)
                 : "0" (i & 0xff), "1" (regs.eflags) );
         else
             __asm__ (
-                "pushf; and $0xffffffee,(%%esp); or %1,(%%esp); popf; daa; "
+                "pushf; andl $~0x11,(%%esp); or %1,(%%esp); popf; daa; "
                 "pushf; popl %1"
                 : "=a" (bcdres_native), "=r" (regs.eflags)
                 : "0" (i & 0xff), "1" (regs.eflags) );




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (17 preceding siblings ...)
  2017-12-07 14:12 ` [PATCH v3 18/25] x86emul: add missing suffixes in test harness Jan Beulich
@ 2017-12-07 14:14 ` Jan Beulich
  2017-12-08 10:58   ` Paul Durrant
  2018-02-02 14:13   ` Andrew Cooper
  2017-12-07 14:15 ` [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures Jan Beulich
                   ` (5 subsequent siblings)
  24 siblings, 2 replies; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:14 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Paul Durrant, Tim Deegan

This is necessary for the hook to correctly perform the operation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
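
A minimal sketch (mine, not the real hook: a flat pointer instead of
the seg:offset translation, 8-byte accesses only, and GCC's
__sync_val_compare_and_swap() standing in for a stub-based CMPXCHG) of
how an implementation can use the new flag:

  #include <stdbool.h>
  #include <stdint.h>

  static uint64_t demo_cmpxchg8(volatile uint64_t *addr, uint64_t old,
                                uint64_t new, bool lock)
  {
      if ( lock )
          /* LOCK was in effect: must be atomic w.r.t. other CPUs. */
          return __sync_val_compare_and_swap(addr, old, new);

      /* No LOCK prefix: a plain compare + write is an acceptable
       * relaxation, as un-LOCKed CMPXCHG isn't guaranteed atomic
       * across CPUs anyway. */
      uint64_t prev = *addr;

      if ( prev == old )
          *addr = new;
      return prev;
  }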

--- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
+++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
@@ -346,6 +346,7 @@ static int fuzz_cmpxchg(
     void *old,
     void *new,
     unsigned int bytes,
+    bool lock,
     struct x86_emulate_ctxt *ctxt)
 {
     /*
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -320,6 +320,7 @@ static int cmpxchg(
     void *old,
     void *new,
     unsigned int bytes,
+    bool lock,
     struct x86_emulate_ctxt *ctxt)
 {
     if ( verbose )
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1249,6 +1249,7 @@ static int hvmemul_cmpxchg_discard(
     void *p_old,
     void *p_new,
     unsigned int bytes,
+    bool lock,
     struct x86_emulate_ctxt *ctxt)
 {
     return X86EMUL_OKAY;
@@ -1292,6 +1293,7 @@ static int hvmemul_cmpxchg(
     void *p_old,
     void *p_new,
     unsigned int bytes,
+    bool lock,
     struct x86_emulate_ctxt *ctxt)
 {
     /* Fix this in case the guest is really relying on r-m-w atomicity. */
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -281,6 +281,7 @@ hvm_emulate_cmpxchg(enum x86_segment seg
                     void *p_old,
                     void *p_new,
                     unsigned int bytes,
+                    bool lock,
                     struct x86_emulate_ctxt *ctxt)
 {
     struct sh_emulate_ctxt *sh_ctxt =
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -216,7 +216,7 @@ static int ptwr_emulated_write(enum x86_
 
 static int ptwr_emulated_cmpxchg(enum x86_segment seg, unsigned long offset,
                                  void *p_old, void *p_new, unsigned int bytes,
-                                 struct x86_emulate_ctxt *ctxt)
+                                 bool lock, struct x86_emulate_ctxt *ctxt)
 {
     paddr_t old = 0, new = 0;
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1973,7 +1973,7 @@ protmode_load_seg(
 
         fail_if(!ops->cmpxchg);
         switch ( (rc = ops->cmpxchg(sel_seg, (sel & 0xfff8) + 4, &desc.b,
-                                    &new_desc_b, sizeof(desc.b), ctxt)) )
+                                    &new_desc_b, sizeof(desc.b), true, ctxt)) )
         {
         case X86EMUL_OKAY:
             break;
@@ -6982,7 +6982,8 @@ x86_emulate(
             }
 
             if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
-                                    op_bytes, ctxt)) != X86EMUL_OKAY )
+                                    op_bytes, lock_prefix,
+                                    ctxt)) != X86EMUL_OKAY )
                 goto done;
             _regs.eflags |= X86_EFLAGS_ZF;
         }
@@ -8434,7 +8435,7 @@ x86_emulate(
             fail_if(!ops->cmpxchg);
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val,
-                &dst.val, dst.bytes, ctxt);
+                &dst.val, dst.bytes, true, ctxt);
         }
         else
         {
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -237,10 +237,11 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
-     * cmpxchg: Emulate an atomic (LOCKed) CMPXCHG operation.
+     * cmpxchg: Emulate a CMPXCHG operation.
      *  @p_old: [IN ] Pointer to value expected to be current at @addr.
      *  @p_new: [IN ] Pointer to value to write to @addr.
      *  @bytes: [IN ] Operation size (up to 8 (x86/32) or 16 (x86/64) bytes).
+     *  @lock:  [IN ] atomic (LOCKed) operation
      */
     int (*cmpxchg)(
         enum x86_segment seg,
@@ -248,6 +249,7 @@ struct x86_emulate_ops
         void *p_old,
         void *p_new,
         unsigned int bytes,
+        bool lock,
         struct x86_emulate_ctxt *ctxt);
 
     /*



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 85+ messages in thread

* [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (18 preceding siblings ...)
  2017-12-07 14:14 ` [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect Jan Beulich
@ 2017-12-07 14:15 ` Jan Beulich
  2018-02-02 14:49   ` Andrew Cooper
  2017-12-07 14:16 ` [PATCH v3 21/25] x86emul: add read-modify-write hook Jan Beulich
                   ` (4 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:15 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Tim Deegan

If the ->cmpxchg() hook finds a mismatch, we should deal with this the
same way as when the "manual" comparison reports a mismatch.

This involves reverting bfce0e62c3 ("x86/emul: Drop
X86EMUL_CMPXCHG_FAILED"), albeit with X86EMUL_CMPXCHG_FAILED now
becoming a value distinct from X86EMUL_RETRY.

In order not to leave mixed code, also fully switch the affected
functions from paddr_t to intpte_t.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
---
The code could be further simplified if we could rely on all
->cmpxchg() hooks always using CMPXCHG, but for now we need to cope
with them using plain writes (and hence accept the double reads if
CMPXCHG is actually being used).
Note that the patch doesn't address the incorrectness of no memory
write occurring in the comparison-failed case (architecturally CMPXCHG
writes back the destination even when the comparison fails).
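
A minimal sketch (not part of the patch) of the protocol a ->cmpxchg()
hook is expected to follow after this change. It assumes a test-harness
like flat address space where @offset is directly dereferenceable, only
handles the native word size, and ignores @lock for brevity:

    static int sketch_cmpxchg(enum x86_segment seg, unsigned long offset,
                              void *p_old, void *p_new, unsigned int bytes,
                              bool lock, struct x86_emulate_ctxt *ctxt)
    {
        unsigned long old = 0, new = 0, cur;

        if ( bytes != sizeof(long) )
            return X86EMUL_UNHANDLEABLE;     /* keep the sketch simple */

        memcpy(&old, p_old, bytes);
        memcpy(&new, p_new, bytes);

        /* LOCKed compare-and-swap; a real hook would honour @lock. */
        cur = __sync_val_compare_and_swap((unsigned long *)offset, old, new);
        if ( cur == old )
            return X86EMUL_OKAY;

        /* Mismatch: hand back the value actually seen, and signal the
         * failure distinctly from X86EMUL_RETRY. */
        memcpy(p_old, &cur, bytes);
        return X86EMUL_CMPXCHG_FAILED;
    }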

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -302,8 +302,12 @@ hvm_emulate_cmpxchg(enum x86_segment seg
     memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
-    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-               v, addr, old, new, bytes, sh_ctxt);
+    rc = v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
+             v, addr, &old, new, bytes, sh_ctxt);
+
+    memcpy(p_old, &old, bytes);
+
+    return rc;
 }
 
 static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -4741,11 +4741,11 @@ sh_x86_emulate_write(struct vcpu *v, uns
 
 static int
 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
-                        unsigned long old, unsigned long new,
-                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
+                       unsigned long *p_old, unsigned long new,
+                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
     void *addr;
-    unsigned long prev;
+    unsigned long prev, old = *p_old;
     int rv = X86EMUL_OKAY;
 
     /* Unaligned writes are only acceptable on HVM */
@@ -4769,7 +4769,10 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
     }
 
     if ( prev != old )
-        rv = X86EMUL_RETRY;
+    {
+        *p_old = prev;
+        rv = X86EMUL_CMPXCHG_FAILED;
+    }
 
     SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
                   " wanted %#lx now %#lx bytes %u\n",
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -65,14 +65,16 @@ static int ptwr_emulated_read(enum x86_s
     return X86EMUL_OKAY;
 }
 
-static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val,
-                                unsigned int bytes, unsigned int do_cmpxchg,
+static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
+                                intpte_t val, unsigned int bytes,
                                 struct x86_emulate_ctxt *ctxt)
 {
     unsigned long mfn;
     unsigned long unaligned_addr = addr;
     struct page_info *page;
     l1_pgentry_t pte, ol1e, nl1e, *pl1e;
+    intpte_t old = p_old ? *p_old : 0;
+    unsigned int offset = 0;
     struct vcpu *v = current;
     struct domain *d = v->domain;
     struct ptwr_emulate_ctxt *ptwr_ctxt = ctxt->data;
@@ -88,28 +90,30 @@ static int ptwr_emulated_update(unsigned
     }
 
     /* Turn a sub-word access into a full-word access. */
-    if ( bytes != sizeof(paddr_t) )
+    if ( bytes != sizeof(val) )
     {
-        paddr_t      full;
-        unsigned int rc, offset = addr & (sizeof(paddr_t) - 1);
+        intpte_t full;
+        unsigned int rc;
+
+        offset = addr & (sizeof(full) - 1);
 
         /* Align address; read full word. */
-        addr &= ~(sizeof(paddr_t) - 1);
-        if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
+        addr &= ~(sizeof(full) - 1);
+        if ( (rc = copy_from_user(&full, (void *)addr, sizeof(full))) != 0 )
         {
             x86_emul_pagefault(0, /* Read fault. */
-                               addr + sizeof(paddr_t) - rc,
+                               addr + sizeof(full) - rc,
                                ctxt);
             return X86EMUL_EXCEPTION;
         }
         /* Mask out bits provided by caller. */
-        full &= ~((((paddr_t)1 << (bytes * 8)) - 1) << (offset * 8));
+        full &= ~((((intpte_t)1 << (bytes * 8)) - 1) << (offset * 8));
         /* Shift the caller value and OR in the missing bits. */
-        val  &= (((paddr_t)1 << (bytes * 8)) - 1);
+        val  &= (((intpte_t)1 << (bytes * 8)) - 1);
         val <<= (offset) * 8;
         val  |= full;
         /* Also fill in missing parts of the cmpxchg old value. */
-        old  &= (((paddr_t)1 << (bytes * 8)) - 1);
+        old  &= (((intpte_t)1 << (bytes * 8)) - 1);
         old <<= (offset) * 8;
         old  |= full;
     }
@@ -131,7 +135,7 @@ static int ptwr_emulated_update(unsigned
     {
     default:
         if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
-             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+             !p_old && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
         {
             /*
              * If this is an upper-half write to a PAE PTE then we assume that
@@ -162,21 +166,26 @@ static int ptwr_emulated_update(unsigned
     /* Checked successfully: do the update (write or cmpxchg). */
     pl1e = map_domain_page(_mfn(mfn));
     pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
-    if ( do_cmpxchg )
+    if ( p_old )
     {
-        bool okay;
-        intpte_t t = old;
 
         ol1e = l1e_from_intpte(old);
-        okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
-                                          &t, l1e_get_intpte(nl1e), _mfn(mfn));
-        okay = (okay && t == old);
+        if ( !paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
+                                         &old, l1e_get_intpte(nl1e), _mfn(mfn)) )
+            ret = X86EMUL_UNHANDLEABLE;
+        else if ( l1e_get_intpte(ol1e) == old )
+            ret = X86EMUL_OKAY;
+        else
+        {
+            *p_old = old >> (offset * 8);
+            ret = X86EMUL_CMPXCHG_FAILED;
+        }
 
-        if ( !okay )
+        if ( ret != X86EMUL_OKAY )
         {
             unmap_domain_page(pl1e);
             put_page_from_l1e(nl1e, d);
-            return X86EMUL_RETRY;
+            return ret;
         }
     }
     else
@@ -200,9 +209,9 @@ static int ptwr_emulated_write(enum x86_
                                void *p_data, unsigned int bytes,
                                struct x86_emulate_ctxt *ctxt)
 {
-    paddr_t val = 0;
+    intpte_t val = 0;
 
-    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) || !bytes )
+    if ( (bytes > sizeof(val)) || (bytes & (bytes - 1)) || !bytes )
     {
         gdprintk(XENLOG_WARNING, "bad write size (addr=%lx, bytes=%u)\n",
                  offset, bytes);
@@ -211,16 +220,17 @@ static int ptwr_emulated_write(enum x86_
 
     memcpy(&val, p_data, bytes);
 
-    return ptwr_emulated_update(offset, 0, val, bytes, 0, ctxt);
+    return ptwr_emulated_update(offset, NULL, val, bytes, ctxt);
 }
 
 static int ptwr_emulated_cmpxchg(enum x86_segment seg, unsigned long offset,
                                  void *p_old, void *p_new, unsigned int bytes,
                                  bool lock, struct x86_emulate_ctxt *ctxt)
 {
-    paddr_t old = 0, new = 0;
+    intpte_t old = 0, new = 0;
+    int rc;
 
-    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) )
+    if ( (bytes > sizeof(new)) || (bytes & (bytes - 1)) )
     {
         gdprintk(XENLOG_WARNING, "bad cmpxchg size (addr=%lx, bytes=%u)\n",
                  offset, bytes);
@@ -230,7 +240,11 @@ static int ptwr_emulated_cmpxchg(enum x8
     memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
-    return ptwr_emulated_update(offset, old, new, bytes, 1, ctxt);
+    rc = ptwr_emulated_update(offset, &old, new, bytes, ctxt);
+
+    memcpy(p_old, &old, bytes);
+
+    return rc;
 }
 
 static const struct x86_emulate_ops ptwr_emulate_ops = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1985,6 +1985,9 @@ protmode_load_seg(
 
         default:
             return rc;
+
+        case X86EMUL_CMPXCHG_FAILED:
+            return X86EMUL_RETRY;
         }
 
         /* Force the Accessed flag in our local copy. */
@@ -6644,21 +6647,45 @@ x86_emulate(
         break;
 
     case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
-        /* Save real source value, then compare EAX against destination. */
-        src.orig_val = src.val;
-        src.val = _regs.r(ax);
-        /* cmp: %%eax - dst ==> dst and src swapped for macro invocation */
-        emulate_2op_SrcV("cmp", dst, src, _regs.eflags);
-        if ( _regs.eflags & X86_EFLAGS_ZF )
+        fail_if(!ops->cmpxchg);
+        _regs.eflags &= ~EFLAGS_MASK;
+        if ( !((dst.val ^ _regs.r(ax)) &
+               (~0UL >> (8 * (sizeof(long) - dst.bytes)))) )
         {
             /* Success: write back to memory. */
-            dst.val = src.orig_val;
+            if ( dst.type == OP_MEM )
+            {
+                dst.val = _regs.r(ax);
+                switch ( rc = ops->cmpxchg(dst.mem.seg, dst.mem.off, &dst.val,
+                                           &src.val, dst.bytes, lock_prefix,
+                                           ctxt) )
+                {
+                case X86EMUL_OKAY:
+                    dst.type = OP_NONE;
+                    _regs.eflags |= X86_EFLAGS_ZF | X86_EFLAGS_PF;
+                    break;
+                case X86EMUL_CMPXCHG_FAILED:
+                    rc = X86EMUL_OKAY;
+                    break;
+                default:
+                    goto done;
+                }
+            }
+            else
+            {
+                dst.val = src.val;
+                _regs.eflags |= X86_EFLAGS_ZF | X86_EFLAGS_PF;
+            }
         }
-        else
+        if ( !(_regs.eflags & X86_EFLAGS_ZF) )
         {
             /* Failure: write the value we saw to EAX. */
             dst.type = OP_REG;
             dst.reg  = (unsigned long *)&_regs.r(ax);
+            /* cmp: %%eax - dst ==> dst and src swapped for macro invocation */
+            src.val = _regs.r(ax);
+            emulate_2op_SrcV("cmp", dst, src, _regs.eflags);
+            ASSERT(!(_regs.eflags & X86_EFLAGS_ZF));
         }
         break;
 
@@ -6959,6 +6986,7 @@ x86_emulate(
 
         if ( memcmp(old, aux, op_bytes) )
         {
+        cmpxchgNb_failed:
             /* Expected != actual: store actual to rDX:rAX and clear ZF. */
             _regs.r(ax) = !(rex_prefix & REX_W) ? old->u32[0] : old->u64[0];
             _regs.r(dx) = !(rex_prefix & REX_W) ? old->u32[1] : old->u64[1];
@@ -6968,7 +6996,7 @@ x86_emulate(
         {
             /*
              * Expected == actual: Get proposed value, attempt atomic cmpxchg
-             * and set ZF.
+             * and set ZF if successful.
              */
             if ( !(rex_prefix & REX_W) )
             {
@@ -6981,11 +7009,20 @@ x86_emulate(
                 aux->u64[1] = _regs.r(cx);
             }
 
-            if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
-                                    op_bytes, lock_prefix,
-                                    ctxt)) != X86EMUL_OKAY )
+            switch ( rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
+                                       op_bytes, lock_prefix, ctxt) )
+            {
+            case X86EMUL_OKAY:
+                _regs.eflags |= X86_EFLAGS_ZF;
+                break;
+
+            case X86EMUL_CMPXCHG_FAILED:
+                rc = X86EMUL_OKAY;
+                goto cmpxchgNb_failed;
+
+            default:
                 goto done;
-            _regs.eflags |= X86_EFLAGS_ZF;
+            }
         }
         break;
     }
@@ -8436,6 +8473,8 @@ x86_emulate(
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val,
                 &dst.val, dst.bytes, true, ctxt);
+            if ( rc == X86EMUL_CMPXCHG_FAILED )
+                rc = X86EMUL_RETRY;
         }
         else
         {
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -150,6 +150,8 @@ struct x86_emul_fpu_aux {
   * strictly expected for now.
  */
 #define X86EMUL_UNRECOGNIZED   X86EMUL_UNIMPLEMENTED
+ /* (cmpxchg accessor): CMPXCHG failed. */
+#define X86EMUL_CMPXCHG_FAILED 7
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -239,6 +241,8 @@ struct x86_emulate_ops
     /*
      * cmpxchg: Emulate a CMPXCHG operation.
      *  @p_old: [IN ] Pointer to value expected to be current at @addr.
+     *          [OUT] Pointer to value found at @addr (may always be
+     *                updated, meaningful for X86EMUL_CMPXCHG_FAILED only).
      *  @p_new: [IN ] Pointer to value to write to @addr.
      *  @bytes: [IN ] Operation size (up to 8 (x86/32) or 16 (x86/64) bytes).
      *  @lock:  [IN ] atomic (LOCKed) operation
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -86,7 +86,7 @@ struct shadow_paging_mode {
                                             void *src, u32 bytes,
                                             struct sh_emulate_ctxt *sh_ctxt);
     int           (*x86_emulate_cmpxchg   )(struct vcpu *v, unsigned long va,
-                                            unsigned long old, 
+                                            unsigned long *old,
                                             unsigned long new,
                                             unsigned int bytes,
                                             struct sh_emulate_ctxt *sh_ctxt);




* [PATCH v3 21/25] x86emul: add read-modify-write hook
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (19 preceding siblings ...)
  2017-12-07 14:15 ` [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures Jan Beulich
@ 2017-12-07 14:16 ` Jan Beulich
  2018-02-02 16:13   ` Andrew Cooper
  2017-12-07 14:16 ` [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg() Jan Beulich
                   ` (3 subsequent siblings)
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:16 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

In order to correctly emulate read-modify-write insns, especially
LOCKed ones, we should not issue reads and writes separately. Use a
new hook to combine both, and don't uniformly read the memory
destination anymore. Instead, DstMem opcodes without Mov now need to
do the read themselves in their respective case blocks.

Also strip bogus _ prefixes from macro parameters when this only affects
lines which are being changed anyway.

In the test harness, besides some re-ordering to facilitate running a
few tests twice (once without and a second time with the .rmw hook in
place), tighten a few EFLAGS checks and add a test for NOT with a
memory operand (in particular to verify EFLAGS don't get altered there).

For now, use of the hook is optional for callers; eventually we may
want to consider making it mandatory.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
---
TBD: Do we want to also support non-lockable RMW insns in the new hook
     and helper (SHL & friends, SHLD, SHRD)?
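
For illustration only, a minimal sketch (not part of the patch) of what
a caller-side .rmw hook boils down to: translate the destination to a
host mapping and hand it to x86_emul_rmw(), which applies the decoded
operation in place. map_dst() and unmap_dst() are hypothetical
placeholders for the caller's own mapping logic:

    static int sketch_rmw(enum x86_segment seg, unsigned long offset,
                          unsigned int bytes, uint32_t *eflags,
                          struct x86_emulate_state *state,
                          struct x86_emulate_ctxt *ctxt)
    {
        void *mapping = map_dst(seg, offset, bytes, ctxt);  /* hypothetical */
        int rc;

        if ( !mapping )
            return X86EMUL_UNHANDLEABLE;

        /* Applies the operation (add, and, not, xchg, ...) directly to the
         * mapping, emitting a LOCK prefix iff one was decoded into @state. */
        rc = x86_emul_rmw(mapping, bytes, eflags, state, ctxt);

        unmap_dst(mapping, offset, bytes, ctxt);            /* hypothetical */
        return rc;
    }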

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -314,6 +314,17 @@ static int write(
     return X86EMUL_OKAY;
 }
 
+static int rmw(
+    enum x86_segment seg,
+    unsigned long offset,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return x86_emul_rmw((void *)offset, bytes, eflags, state, ctxt);
+}
+
 static int cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -378,6 +389,9 @@ static struct x86_emulate_ops emulops =
     .put_fpu    = emul_test_put_fpu,
 };
 
+#define EFLAGS_ALWAYS_SET (X86_EFLAGS_IF | X86_EFLAGS_MBS)
+#define EFLAGS_MASK (X86_EFLAGS_ARITH_MASK | EFLAGS_ALWAYS_SET)
+
 int main(int argc, char **argv)
 {
     struct x86_emulate_ctxt ctxt;
@@ -414,6 +428,7 @@ int main(int argc, char **argv)
     if ( !stack_exec )
         printf("Warning: Stack could not be made executable (%d).\n", errno);
 
+ rmw_restart:
     printf("%-40s", "Testing addl %ecx,(%eax)...");
     instr[0] = 0x01; instr[1] = 0x08;
     regs.eflags = 0x200;
@@ -541,35 +556,32 @@ int main(int argc, char **argv)
         goto fail;
     printf("okay\n");
 
-    printf("%-40s", "Testing rep movsw...");
-    instr[0] = 0xf3; instr[1] = 0x66; instr[2] = 0xa5;
+    printf("%-40s", "Testing notb (%edi)...");
+    instr[0] = 0xf6; instr[1] = 0x17;
     *res        = 0x22334455;
-    regs.eflags = 0x200;
-    regs.ecx    = 23;
+    regs.eflags = EFLAGS_MASK;
     regs.eip    = (unsigned long)&instr[0];
-    regs.esi    = (unsigned long)res + 0;
-    regs.edi    = (unsigned long)res + 2;
+    regs.edi    = (unsigned long)res;
     rc = x86_emulate(&ctxt, &emulops);
-    if ( (rc != X86EMUL_OKAY) || 
-         (*res != 0x44554455) ||
-         (regs.eflags != 0x200) ||
-         (regs.ecx != 22) || 
-         (regs.esi != ((unsigned long)res + 2)) ||
-         (regs.edi != ((unsigned long)res + 4)) ||
-         (regs.eip != (unsigned long)&instr[0]) )
+    if ( (rc != X86EMUL_OKAY) ||
+         (*res != 0x223344aa) ||
+         ((regs.eflags & EFLAGS_MASK) != EFLAGS_MASK) ||
+         (regs.eip != (unsigned long)&instr[2]) )
         goto fail;
     printf("okay\n");
 
     printf("%-40s", "Testing btrl $0x1,(%edi)...");
     instr[0] = 0x0f; instr[1] = 0xba; instr[2] = 0x37; instr[3] = 0x01;
     *res        = 0x2233445F;
-    regs.eflags = 0x200;
+    regs.eflags = EFLAGS_ALWAYS_SET;
     regs.eip    = (unsigned long)&instr[0];
     regs.edi    = (unsigned long)res;
     rc = x86_emulate(&ctxt, &emulops);
     if ( (rc != X86EMUL_OKAY) ||
          (*res != 0x2233445D) ||
-         ((regs.eflags&0x201) != 0x201) ||
+         ((regs.eflags & (EFLAGS_ALWAYS_SET | X86_EFLAGS_ZF |
+                          X86_EFLAGS_CF)) !=
+          (EFLAGS_ALWAYS_SET | X86_EFLAGS_CF)) ||
          (regs.eip != (unsigned long)&instr[4]) )
         goto fail;
     printf("okay\n");
@@ -577,14 +589,16 @@ int main(int argc, char **argv)
     printf("%-40s", "Testing btrl %eax,(%edi)...");
     instr[0] = 0x0f; instr[1] = 0xb3; instr[2] = 0x07;
     *res        = 0x2233445F;
-    regs.eflags = 0x200;
+    regs.eflags = EFLAGS_ALWAYS_SET | X86_EFLAGS_ZF;
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = -32;
     regs.edi    = (unsigned long)(res+1);
     rc = x86_emulate(&ctxt, &emulops);
     if ( (rc != X86EMUL_OKAY) ||
          (*res != 0x2233445E) ||
-         ((regs.eflags&0x201) != 0x201) ||
+         ((regs.eflags & (EFLAGS_ALWAYS_SET | X86_EFLAGS_ZF |
+                          X86_EFLAGS_CF)) !=
+          (EFLAGS_ALWAYS_SET | X86_EFLAGS_ZF | X86_EFLAGS_CF)) ||
          (regs.eip != (unsigned long)&instr[3]) )
         goto fail;
     printf("okay\n");
@@ -592,19 +606,63 @@ int main(int argc, char **argv)
 #ifdef __x86_64__
     printf("%-40s", "Testing btcq %r8,(%r11)...");
     instr[0] = 0x4d; instr[1] = 0x0f; instr[2] = 0xbb; instr[3] = 0x03;
-    regs.eflags = 0x200;
+    regs.eflags = EFLAGS_ALWAYS_SET;
     regs.rip    = (unsigned long)&instr[0];
     regs.r8     = (-1L << 40) + 1;
     regs.r11    = (unsigned long)(res + (1L << 35));
     rc = x86_emulate(&ctxt, &emulops);
     if ( (rc != X86EMUL_OKAY) ||
          (*res != 0x2233445C) ||
-         (regs.eflags != 0x201) ||
+         ((regs.eflags & (EFLAGS_ALWAYS_SET | X86_EFLAGS_ZF |
+                          X86_EFLAGS_CF)) !=
+          (EFLAGS_ALWAYS_SET | X86_EFLAGS_CF)) ||
          (regs.rip != (unsigned long)&instr[4]) )
         goto fail;
     printf("okay\n");
 #endif
 
+    printf("%-40s", "Testing xadd %ax,(%ecx)...");
+    instr[0] = 0x66; instr[1] = 0x0f; instr[2] = 0xc1; instr[3] = 0x01;
+    regs.eflags = EFLAGS_ALWAYS_SET | X86_EFLAGS_ARITH_MASK;
+    regs.eip    = (unsigned long)&instr[0];
+    regs.ecx    = (unsigned long)res;
+    regs.eax    = 0x12345678;
+    *res        = 0x11111111;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (*res != 0x11116789) ||
+         (regs.eax != 0x12341111) ||
+         ((regs.eflags & EFLAGS_MASK) != EFLAGS_ALWAYS_SET) ||
+         (regs.eip != (unsigned long)&instr[4]) )
+        goto fail;
+    printf("okay\n");
+
+    if ( !emulops.rmw )
+    {
+        printf("[Switching to read-modify-write mode]\n");
+        emulops.rmw = rmw;
+        goto rmw_restart;
+    }
+
+    printf("%-40s", "Testing rep movsw...");
+    instr[0] = 0xf3; instr[1] = 0x66; instr[2] = 0xa5;
+    *res        = 0x22334455;
+    regs.eflags = 0x200;
+    regs.ecx    = 23;
+    regs.eip    = (unsigned long)&instr[0];
+    regs.esi    = (unsigned long)res + 0;
+    regs.edi    = (unsigned long)res + 2;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (*res != 0x44554455) ||
+         (regs.eflags != 0x200) ||
+         (regs.ecx != 22) ||
+         (regs.esi != ((unsigned long)res + 2)) ||
+         (regs.edi != ((unsigned long)res + 4)) ||
+         (regs.eip != (unsigned long)&instr[0]) )
+        goto fail;
+    printf("okay\n");
+
     res[0] = 0x12345678;
     res[1] = 0x87654321;
 
@@ -730,22 +788,6 @@ int main(int argc, char **argv)
 #endif
     printf("okay\n");
 
-    printf("%-40s", "Testing xadd %ax,(%ecx)...");
-    instr[0] = 0x66; instr[1] = 0x0f; instr[2] = 0xc1; instr[3] = 0x01;
-    regs.eflags = 0x200;
-    regs.eip    = (unsigned long)&instr[0];
-    regs.ecx    = (unsigned long)res;
-    regs.eax    = 0x12345678;
-    *res        = 0x11111111;
-    rc = x86_emulate(&ctxt, &emulops);
-    if ( (rc != X86EMUL_OKAY) ||
-         (*res != 0x11116789) ||
-         (regs.eax != 0x12341111) ||
-         ((regs.eflags&0x240) != 0x200) ||
-         (regs.eip != (unsigned long)&instr[4]) )
-        goto fail;
-    printf("okay\n");
-
     printf("%-40s", "Testing dec %ax...");
 #ifndef __x86_64__
     instr[0] = 0x66; instr[1] = 0x48;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -664,6 +664,25 @@ struct x86_emulate_state {
         ext_8f09,
         ext_8f0a,
     } ext;
+    enum {
+        rmw_NONE,
+        rmw_adc,
+        rmw_add,
+        rmw_and,
+        rmw_btc,
+        rmw_btr,
+        rmw_bts,
+        rmw_dec,
+        rmw_inc,
+        rmw_neg,
+        rmw_not,
+        rmw_or,
+        rmw_sbb,
+        rmw_sub,
+        rmw_xadd,
+        rmw_xchg,
+        rmw_xor,
+    } rmw;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t sib_index, sib_scale;
     uint8_t rex_prefix;
@@ -818,123 +837,136 @@ typedef union {
 "orl  %"_LO32 _tmp",%"_LO32 _sav"; "
 
 /* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags, wsx,wsy,wdx,wdy,       \
-                             lsx,lsy,ldx,ldy, qsx,qsy,qdx,qdy)             \
+#define __emulate_2op_nobyte(_op, src, dst, sz, eflags, wsx,wsy,wdx,wdy,   \
+                             lsx,lsy,ldx,ldy, qsx,qsy,qdx,qdy, extra...)   \
 do{ unsigned long _tmp;                                                    \
-    switch ( (_dst).bytes )                                                \
+    switch ( sz )                                                          \
     {                                                                      \
     case 2:                                                                \
         asm volatile (                                                     \
             _PRE_EFLAGS("0","4","2")                                       \
             _op"w %"wsx"3,%"wdx"1; "                                       \
             _POST_EFLAGS("0","4","2")                                      \
-            : "+g" (_eflags), "+" wdy ((_dst).val), "=&r" (_tmp)           \
-            : wsy ((_src).val), "i" (EFLAGS_MASK) );                       \
+            : "+g" (eflags), "+" wdy (*(dst)), "=&r" (_tmp)                \
+            : wsy (src), "i" (EFLAGS_MASK), ## extra );                    \
         break;                                                             \
     case 4:                                                                \
         asm volatile (                                                     \
             _PRE_EFLAGS("0","4","2")                                       \
             _op"l %"lsx"3,%"ldx"1; "                                       \
             _POST_EFLAGS("0","4","2")                                      \
-            : "+g" (_eflags), "+" ldy ((_dst).val), "=&r" (_tmp)           \
-            : lsy ((_src).val), "i" (EFLAGS_MASK) );                       \
+            : "+g" (eflags), "+" ldy (*(dst)), "=&r" (_tmp)                \
+            : lsy (src), "i" (EFLAGS_MASK), ## extra );                    \
         break;                                                             \
     case 8:                                                                \
-        __emulate_2op_8byte(_op, _src, _dst, _eflags, qsx, qsy, qdx, qdy); \
+        __emulate_2op_8byte(_op, src, dst, eflags, qsx, qsy, qdx, qdy,     \
+                            ## extra);                                     \
         break;                                                             \
     }                                                                      \
 } while (0)
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)\
+#define __emulate_2op(_op, src, dst, sz, eflags, _bx, by, wx, wy,          \
+                      lx, ly, qx, qy, extra...)                            \
 do{ unsigned long _tmp;                                                    \
-    switch ( (_dst).bytes )                                                \
+    switch ( sz )                                                          \
     {                                                                      \
     case 1:                                                                \
         asm volatile (                                                     \
             _PRE_EFLAGS("0","4","2")                                       \
             _op"b %"_bx"3,%1; "                                            \
             _POST_EFLAGS("0","4","2")                                      \
-            : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp)              \
-            : _by ((_src).val), "i" (EFLAGS_MASK) );                       \
+            : "+g" (eflags), "+m" (*(dst)), "=&r" (_tmp)                   \
+            : by (src), "i" (EFLAGS_MASK), ##extra );                      \
         break;                                                             \
     default:                                                               \
-        __emulate_2op_nobyte(_op,_src,_dst,_eflags, _wx,_wy,"","m",        \
-                             _lx,_ly,"","m", _qx,_qy,"","m");              \
+        __emulate_2op_nobyte(_op, src, dst, sz, eflags, wx, wy, "", "m",   \
+                             lx, ly, "", "m", qx, qy, "", "m", ##extra);   \
         break;                                                             \
     }                                                                      \
 } while (0)
 /* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                         \
-    __emulate_2op(_op, _src, _dst, _eflags,                                \
+#define _emulate_2op_SrcB(op, src, dst, sz, eflags)                        \
+    __emulate_2op(op, src, dst, sz, eflags,                                \
                   "b", "c", "b", "c", "b", "c", "b", "c")
+#define emulate_2op_SrcB(op, src, dst, eflags)                             \
+    _emulate_2op_SrcB(op, (src).val, &(dst).val, (dst).bytes, eflags)
 /* Source operand is byte, word, long or quad sized. */
+#define _emulate_2op_SrcV(op, src, dst, sz, eflags, extra...)              \
+    __emulate_2op(op, src, dst, sz, eflags,                                \
+                  "b", "q", "w", "r", _LO32, "r", "", "r", ##extra)
 #define emulate_2op_SrcV(_op, _src, _dst, _eflags)                         \
-    __emulate_2op(_op, _src, _dst, _eflags,                                \
-                  "b", "q", "w", "r", _LO32, "r", "", "r")
+    _emulate_2op_SrcV(_op, (_src).val, &(_dst).val, (_dst).bytes, _eflags)
 /* Source operand is word, long or quad sized. */
+#define _emulate_2op_SrcV_nobyte(op, src, dst, sz, eflags, extra...)       \
+    __emulate_2op_nobyte(op, src, dst, sz, eflags, "w", "r", "", "m",      \
+                         _LO32, "r", "", "m", "", "r", "", "m", ##extra)
 #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)                  \
-    __emulate_2op_nobyte(_op, _src, _dst, _eflags, "w", "r", "", "m",      \
-                         _LO32, "r", "", "m", "", "r", "", "m")
+    _emulate_2op_SrcV_nobyte(_op, (_src).val, &(_dst).val, (_dst).bytes,   \
+                             _eflags)
 /* Operands are word, long or quad sized and source may be in memory. */
 #define emulate_2op_SrcV_srcmem(_op, _src, _dst, _eflags)                  \
-    __emulate_2op_nobyte(_op, _src, _dst, _eflags, "", "m", "w", "r",      \
+    __emulate_2op_nobyte(_op, (_src).val, &(_dst).val, (_dst).bytes,       \
+                         _eflags, "", "m", "w", "r",                       \
                          "", "m", _LO32, "r", "", "m", "", "r")
 
 /* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op,_dst,_eflags)                                      \
+#define _emulate_1op(_op, dst, sz, eflags, extra...)                       \
 do{ unsigned long _tmp;                                                    \
-    switch ( (_dst).bytes )                                                \
+    switch ( sz )                                                          \
     {                                                                      \
     case 1:                                                                \
         asm volatile (                                                     \
             _PRE_EFLAGS("0","3","2")                                       \
             _op"b %1; "                                                    \
             _POST_EFLAGS("0","3","2")                                      \
-            : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp)              \
-            : "i" (EFLAGS_MASK) );                                         \
+            : "+g" (eflags), "+m" (*(dst)), "=&r" (_tmp)                   \
+            : "i" (EFLAGS_MASK), ##extra );                                \
         break;                                                             \
     case 2:                                                                \
         asm volatile (                                                     \
             _PRE_EFLAGS("0","3","2")                                       \
             _op"w %1; "                                                    \
             _POST_EFLAGS("0","3","2")                                      \
-            : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp)              \
-            : "i" (EFLAGS_MASK) );                                         \
+            : "+g" (eflags), "+m" (*(dst)), "=&r" (_tmp)                   \
+            : "i" (EFLAGS_MASK), ##extra );                                \
         break;                                                             \
     case 4:                                                                \
         asm volatile (                                                     \
             _PRE_EFLAGS("0","3","2")                                       \
             _op"l %1; "                                                    \
             _POST_EFLAGS("0","3","2")                                      \
-            : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp)              \
-            : "i" (EFLAGS_MASK) );                                         \
+            : "+g" (eflags), "+m" (*(dst)), "=&r" (_tmp)                   \
+            : "i" (EFLAGS_MASK), ##extra );                                \
         break;                                                             \
     case 8:                                                                \
-        __emulate_1op_8byte(_op, _dst, _eflags);                           \
+        __emulate_1op_8byte(_op, dst, eflags, ##extra);                    \
         break;                                                             \
     }                                                                      \
 } while (0)
+#define emulate_1op(op, dst, eflags)                                       \
+    _emulate_1op(op, &(dst).val, (dst).bytes, eflags)
 
 /* Emulate an instruction with quadword operands (x86/64 only). */
 #if defined(__x86_64__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, qsx, qsy, qdx, qdy) \
+#define __emulate_2op_8byte(_op, src, dst, eflags,                      \
+                            qsx, qsy, qdx, qdy, extra...)               \
 do{ asm volatile (                                                      \
         _PRE_EFLAGS("0","4","2")                                        \
         _op"q %"qsx"3,%"qdx"1; "                                        \
         _POST_EFLAGS("0","4","2")                                       \
-        : "+g" (_eflags), "+" qdy ((_dst).val), "=&r" (_tmp)            \
-        : qsy ((_src).val), "i" (EFLAGS_MASK) );                        \
+        : "+g" (eflags), "+" qdy (*(dst)), "=&r" (_tmp)                 \
+        : qsy (src), "i" (EFLAGS_MASK), ##extra );                      \
 } while (0)
-#define __emulate_1op_8byte(_op, _dst, _eflags)                         \
+#define __emulate_1op_8byte(_op, dst, eflags, extra...)                 \
 do{ asm volatile (                                                      \
         _PRE_EFLAGS("0","3","2")                                        \
         _op"q %1; "                                                     \
         _POST_EFLAGS("0","3","2")                                       \
-        : "+g" (_eflags), "+m" ((_dst).val), "=&r" (_tmp)               \
-        : "i" (EFLAGS_MASK) );                                          \
+        : "+g" (eflags), "+m" (*(dst)), "=&r" (_tmp)                    \
+        : "i" (EFLAGS_MASK), ##extra );                                 \
 } while (0)
 #elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, qsx, qsy, qdx, qdy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
+#define __emulate_2op_8byte(op, src, dst, eflags, qsx, qsy, qdx, qdy, extra...)
+#define __emulate_1op_8byte(op, dst, eflags, extra...)
 #endif /* __i386__ */
 
 #define fail_if(p)                                      \
@@ -3244,7 +3276,7 @@ x86_emulate(
         break;
     }
 
-    /* Decode and fetch the destination operand: register or memory. */
+    /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
     {
     case DstNone: /* case DstImplicit: */
@@ -3330,7 +3362,13 @@ x86_emulate(
             case 8: dst.val = *(uint64_t *)dst.reg; break;
             }
         }
-        else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
+        else if ( d & Mov ) /* optimisation - avoid slow emulated read */
+        {
+            /* Lock prefix is allowed only on RMW instructions. */
+            generate_exception_if(lock_prefix, EXC_UD);
+            fail_if(!ops->write);
+        }
+        else if ( !ops->rmw )
         {
             fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
             if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
@@ -3338,12 +3376,6 @@ x86_emulate(
                 goto done;
             dst.orig_val = dst.val;
         }
-        else
-        {
-            /* Lock prefix is allowed only on RMW instructions. */
-            generate_exception_if(lock_prefix, EXC_UD);
-            fail_if(!ops->write);
-        }
         break;
     }
 
@@ -3356,35 +3388,83 @@ x86_emulate(
         unsigned int i, n;
         unsigned long dummy;
 
-    case 0x00 ... 0x05: add: /* add */
-        emulate_2op_SrcV("add", src, dst, _regs.eflags);
+    case 0x00: case 0x01: add: /* add reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_add;
+        else
+        {
+    case 0x02 ... 0x05: /* add */
+            emulate_2op_SrcV("add", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x08 ... 0x0d: or:  /* or */
-        emulate_2op_SrcV("or", src, dst, _regs.eflags);
+    case 0x08: case 0x09: or: /* or reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_or;
+        else
+        {
+    case 0x0a ... 0x0d: /* or */
+            emulate_2op_SrcV("or", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x10 ... 0x15: adc: /* adc */
-        emulate_2op_SrcV("adc", src, dst, _regs.eflags);
+    case 0x10: case 0x11: adc: /* adc reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_adc;
+        else
+        {
+    case 0x12 ... 0x15: /* adc */
+            emulate_2op_SrcV("adc", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x18 ... 0x1d: sbb: /* sbb */
-        emulate_2op_SrcV("sbb", src, dst, _regs.eflags);
+    case 0x18: case 0x19: sbb: /* sbb reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_sbb;
+        else
+        {
+    case 0x1a ... 0x1d: /* sbb */
+            emulate_2op_SrcV("sbb", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x20 ... 0x25: and: /* and */
-        emulate_2op_SrcV("and", src, dst, _regs.eflags);
+    case 0x20: case 0x21: and: /* and reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_and;
+        else
+        {
+    case 0x22 ... 0x25: /* and */
+            emulate_2op_SrcV("and", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x28 ... 0x2d: sub: /* sub */
-        emulate_2op_SrcV("sub", src, dst, _regs.eflags);
+    case 0x28: case 0x29: sub: /* sub reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_sub;
+        else
+        {
+    case 0x2a ... 0x2d: /* sub */
+            emulate_2op_SrcV("sub", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x30 ... 0x35: xor: /* xor */
-        emulate_2op_SrcV("xor", src, dst, _regs.eflags);
+    case 0x30: case 0x31: xor: /* xor reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_xor;
+        else
+        {
+    case 0x32 ... 0x35: /* xor */
+            emulate_2op_SrcV("xor", src, dst, _regs.eflags);
+        }
         break;
 
-    case 0x38 ... 0x3d: cmp: /* cmp */
+    case 0x38: case 0x39: cmp: /* cmp reg,mem */
+        if ( ops->rmw && dst.type == OP_MEM &&
+             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
+                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
+            goto done;
+        /* fall through */
+    case 0x3a ... 0x3d: /* cmp */
         generate_exception_if(lock_prefix, EXC_UD);
         emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
         dst.type = OP_NONE;
@@ -3700,6 +3780,13 @@ x86_emulate(
         break;
 
     case 0x86 ... 0x87: xchg: /* xchg */
+        /* The lock prefix is implied for this insn. */
+        lock_prefix = 1;
+        if ( ops->rmw && dst.type == OP_MEM )
+        {
+            state->rmw = rmw_xchg;
+            break;
+        }
         /* Write back the register source. */
         switch ( dst.bytes )
         {
@@ -3708,9 +3795,8 @@ x86_emulate(
         case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */
         case 8: *src.reg = dst.val; break;
         }
-        /* Write back the memory destination with implicit LOCK prefix. */
+        /* Arrange for write back of the memory destination. */
         dst.val = src.val;
-        lock_prefix = 1;
         break;
 
     case 0xc6: /* Grp11: mov / xabort */
@@ -4027,6 +4113,13 @@ x86_emulate(
 
     case 0xc0 ... 0xc1: grp2: /* Grp2 */
         generate_exception_if(lock_prefix, EXC_UD);
+
+        if ( ops->rmw && dst.type == OP_MEM &&
+             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
+                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
+            goto done;
+        dst.orig_val = dst.val;
+
         switch ( modrm_reg & 7 )
         {
         case 0: /* rol */
@@ -4665,12 +4758,22 @@ x86_emulate(
 
         case 0 ... 1: /* test */
             generate_exception_if(lock_prefix, EXC_UD);
+            if ( ops->rmw && dst.type == OP_MEM &&
+                 (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
+                                  dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
+                goto done;
             goto test;
         case 2: /* not */
-            dst.val = ~dst.val;
+            if ( ops->rmw && dst.type == OP_MEM )
+                state->rmw = rmw_not;
+            else
+                dst.val = ~dst.val;
             break;
         case 3: /* neg */
-            emulate_1op("neg", dst, _regs.eflags);
+            if ( ops->rmw && dst.type == OP_MEM )
+                state->rmw = rmw_neg;
+            else
+                emulate_1op("neg", dst, _regs.eflags);
             break;
         case 4: /* mul */
             _regs.eflags &= ~(X86_EFLAGS_OF | X86_EFLAGS_CF);
@@ -4894,10 +4997,16 @@ x86_emulate(
         switch ( modrm_reg & 7 )
         {
         case 0: /* inc */
-            emulate_1op("inc", dst, _regs.eflags);
+            if ( ops->rmw && dst.type == OP_MEM )
+                state->rmw = rmw_inc;
+            else
+                emulate_1op("inc", dst, _regs.eflags);
             break;
         case 1: /* dec */
-            emulate_1op("dec", dst, _regs.eflags);
+            if ( ops->rmw && dst.type == OP_MEM )
+                state->rmw = rmw_dec;
+            else
+                emulate_1op("dec", dst, _regs.eflags);
             break;
         case 2: /* call (near) */
             dst.val = _regs.r(ip);
@@ -6482,6 +6591,12 @@ x86_emulate(
 
     case X86EMUL_OPC(0x0f, 0xa3): bt: /* bt */
         generate_exception_if(lock_prefix, EXC_UD);
+
+        if ( ops->rmw && dst.type == OP_MEM &&
+             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
+                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
+            goto done;
+
         emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
@@ -6493,6 +6608,12 @@ x86_emulate(
         uint8_t shift, width = dst.bytes << 3;
 
         generate_exception_if(lock_prefix, EXC_UD);
+
+        if ( ops->rmw && dst.type == OP_MEM &&
+             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
+                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
+            goto done;
+
         if ( b & 1 )
             shift = _regs.cl;
         else
@@ -6524,7 +6645,10 @@ x86_emulate(
     }
 
     case X86EMUL_OPC(0x0f, 0xab): bts: /* bts */
-        emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_bts;
+        else
+            emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags);
         break;
 
     case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
@@ -6648,6 +6772,12 @@ x86_emulate(
 
     case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
         fail_if(!ops->cmpxchg);
+
+        if ( ops->rmw && dst.type == OP_MEM &&
+             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
+                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
+            goto done;
+
         _regs.eflags &= ~EFLAGS_MASK;
         if ( !((dst.val ^ _regs.r(ax)) &
                (~0UL >> (8 * (sizeof(long) - dst.bytes)))) )
@@ -6696,7 +6826,10 @@ x86_emulate(
         goto les;
 
     case X86EMUL_OPC(0x0f, 0xb3): btr: /* btr */
-        emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_btr;
+        else
+            emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags);
         break;
 
     case X86EMUL_OPC(0x0f, 0xb6): /* movzx rm8,r{16,32,64} */
@@ -6730,7 +6863,10 @@ x86_emulate(
         break;
 
     case X86EMUL_OPC(0x0f, 0xbb): btc: /* btc */
-        emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
+        if ( ops->rmw && dst.type == OP_MEM )
+            state->rmw = rmw_btc;
+        else
+            emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags);
         break;
 
     case X86EMUL_OPC(0x0f, 0xbc): /* bsf or tzcnt */
@@ -6803,6 +6939,11 @@ x86_emulate(
         break;
 
     case X86EMUL_OPC(0x0f, 0xc0): case X86EMUL_OPC(0x0f, 0xc1): /* xadd */
+        if ( ops->rmw && dst.type == OP_MEM )
+        {
+            state->rmw = rmw_xadd;
+            break;
+        }
         /* Write back the register source. */
         switch ( dst.bytes )
         {
@@ -8358,7 +8499,36 @@ x86_emulate(
         goto done;
     }
 
-    if ( state->simd_size )
+    if ( state->rmw )
+    {
+        ea.val = src.val;
+        op_bytes = dst.bytes;
+        rc = ops->rmw(dst.mem.seg, dst.mem.off, dst.bytes, &_regs.eflags,
+                      state, ctxt);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
+
+        /* Some operations require a register to be written. */
+        switch ( state->rmw )
+        {
+        case rmw_xchg:
+        case rmw_xadd:
+            switch ( dst.bytes )
+            {
+            case 1: *(uint8_t  *)src.reg = (uint8_t)ea.val; break;
+            case 2: *(uint16_t *)src.reg = (uint16_t)ea.val; break;
+            case 4: *src.reg = (uint32_t)ea.val; break; /* 64b reg: zero-extend */
+            case 8: *src.reg = ea.val; break;
+            }
+            break;
+
+        default:
+            break;
+        }
+
+        dst.type = OP_NONE;
+    }
+    else if ( state->simd_size )
     {
         generate_exception_if(!op_bytes, EXC_UD);
         generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
@@ -8536,6 +8706,142 @@ x86_emulate(
 #undef vex
 #undef ea
 
+int x86_emul_rmw(
+    void *ptr,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    unsigned long *dst = ptr;
+
+    ASSERT(bytes == state->op_bytes);
+
+#ifdef __x86_64__
+# define JCXZ "jrcxz"
+#else
+# define JCXZ "jecxz"
+#endif
+
+#define COND_LOCK(op) \
+    JCXZ " .L" #op "%=\n\t" \
+    "lock\n" \
+    ".L" #op "%=:\n\t" \
+    #op
+
+    switch ( state->rmw )
+    {
+#define UNOP(op) \
+    case rmw_##op: \
+        _emulate_1op(COND_LOCK(op), dst, bytes, *eflags, \
+                     "c" ((long)state->lock_prefix) ); \
+        break
+#define BINOP(op, sfx) \
+    case rmw_##op: \
+        _emulate_2op_SrcV##sfx(COND_LOCK(op), \
+                               state->ea.val, dst, bytes, *eflags, \
+                               "c" ((long)state->lock_prefix) ); \
+        break
+
+    BINOP(adc, );
+    BINOP(add, );
+    BINOP(and, );
+    BINOP(btc, _nobyte);
+    BINOP(bts, _nobyte);
+    BINOP(btr, _nobyte);
+     UNOP(dec);
+     UNOP(inc);
+     UNOP(neg);
+    BINOP(or, );
+    BINOP(sbb, );
+    BINOP(sub, );
+    BINOP(xor, );
+
+#undef UNOP
+#undef BINOP
+
+    case rmw_not:
+        switch ( state->op_bytes )
+        {
+        case 1:
+            asm ( COND_LOCK(notb) " %0"
+                  : "+m" (*dst) : "c" ((long)state->lock_prefix) );
+            break;
+        case 2:
+            asm ( COND_LOCK(notw) " %0"
+                  : "+m" (*dst) : "c" ((long)state->lock_prefix) );
+            break;
+        case 4:
+            asm ( COND_LOCK(notl) " %0"
+                  : "+m" (*dst) : "c" ((long)state->lock_prefix) );
+            break;
+#ifdef __x86_64__
+        case 8:
+            asm ( COND_LOCK(notq) " %0"
+                  : "+m" (*dst) : "c" ((long)state->lock_prefix) );
+            break;
+#endif
+        }
+        break;
+
+    case rmw_xadd:
+        switch ( state->op_bytes )
+        {
+            unsigned long dummy;
+
+#define XADD(sz, cst, mod) \
+        case sz: \
+            asm ( _PRE_EFLAGS("[efl]", "[msk]", "[tmp]") \
+                  COND_LOCK(xadd) " %"#mod"[reg], %[mem]; " \
+                  _POST_EFLAGS("[efl]", "[msk]", "[tmp]") \
+                  : [reg] "+" #cst (state->ea.val), \
+                    [mem] "+m" (*dst), \
+                    [efl] "+g" (*eflags), \
+                    [tmp] "=&r" (dummy) \
+                  : "c" ((long)state->lock_prefix), \
+                    [msk] "i" (EFLAGS_MASK) ); \
+            break
+        XADD(1, q, b);
+        XADD(2, r, w);
+        XADD(4, r, k);
+#ifdef __x86_64__
+        XADD(8, r, );
+#endif
+#undef XADD
+        }
+        break;
+
+    case rmw_xchg:
+        switch ( state->op_bytes )
+        {
+        case 1:
+            asm ( "xchg %b0, %b1" : "+q" (state->ea.val), "+m" (*dst) );
+            break;
+        case 2:
+            asm ( "xchg %w0, %w1" : "+r" (state->ea.val), "+m" (*dst) );
+            break;
+        case 4:
+#ifdef __x86_64__
+            asm ( "xchg %k0, %k1" : "+r" (state->ea.val), "+m" (*dst) );
+            break;
+        case 8:
+#endif
+            asm ( "xchg %0, %1" : "+r" (state->ea.val), "+m" (*dst) );
+            break;
+        }
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+#undef COND_LOCK
+#undef JCXZ
+
+    return X86EMUL_OKAY;
+}
+
 static void __init __maybe_unused build_assertions(void)
 {
     /* Check the values against SReg3 encoding in opcode/ModRM bytes. */
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -239,6 +239,20 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * rmw: Emulate a memory read-modify-write.
+     * @eflags: [IN/OUT] Pointer to EFLAGS to be updated according to
+     *                   instruction effects.
+     * @state:  [IN/OUT] Pointer to (opaque) emulator state.
+     */
+    int (*rmw)(
+        enum x86_segment seg,
+        unsigned long offset,
+        unsigned int bytes,
+        uint32_t *eflags,
+        struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * cmpxchg: Emulate a CMPXCHG operation.
      *  @p_old: [IN ] Pointer to value expected to be current at @addr.
      *          [OUT] Pointer to value found at @addr (may always be
@@ -684,6 +698,14 @@ void x86_emulate_free_state(struct x86_e
 
 #endif
 
+int
+x86_emul_rmw(
+    void *ptr,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt);
+
 static inline void x86_emul_hw_exception(
     unsigned int vector, int error_code, struct x86_emulate_ctxt *ctxt)
 {




* [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (20 preceding siblings ...)
  2017-12-07 14:16 ` [PATCH v3 21/25] x86emul: add read-modify-write hook Jan Beulich
@ 2017-12-07 14:16 ` Jan Beulich
  2017-12-07 14:38   ` Razvan Cojocaru
                     ` (2 more replies)
  2017-12-07 14:17 ` [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook Jan Beulich
                   ` (2 subsequent siblings)
  24 siblings, 3 replies; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:16 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Paul Durrant

..., at least as far as currently possible, i.e. when a mapping can be
obtained.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
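
As a point of reference (not part of the patch), the 1/2/4/8-byte path
added below reduces to the following pattern: pick a LOCKed or plain
CMPXCHG depending on the @lock argument introduced earlier in the
series, and report a comparison failure back through @p_old. Here
'mapping' stands for the already established host mapping of the guest
destination:

    unsigned long old = 0, new = 0, cur;

    memcpy(&old, p_old, bytes);
    memcpy(&new, p_new, bytes);

    /* Honour the guest's LOCK prefix: bus-lock only when it used one. */
    cur = lock ? __cmpxchg(mapping, old, new, bytes)
               : cmpxchg_local_(mapping, old, new, bytes);

    if ( cur != old )
    {
        memcpy(p_old, &cur, bytes);   /* value actually found at @addr */
        rc = X86EMUL_CMPXCHG_FAILED;
    }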

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1296,8 +1296,83 @@ static int hvmemul_cmpxchg(
     bool lock,
     struct x86_emulate_ctxt *ctxt)
 {
-    /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    struct vcpu *curr = current;
+    unsigned long addr, reps = 1;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+    int rc;
+    void *mapping = NULL;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+    if ( IS_ERR(mapping) )
+        return ~PTR_ERR(mapping);
+
+    if ( !mapping )
+    {
+        /* Fix this in case the guest is really relying on r-m-w atomicity. */
+        return hvmemul_linear_mmio_write(addr, bytes, p_new, pfec,
+                                         hvmemul_ctxt,
+                                         vio->mmio_access.write_access &&
+                                         vio->mmio_gla == (addr & PAGE_MASK));
+    }
+
+    switch ( bytes )
+    {
+    case 1: case 2: case 4: case 8:
+    {
+        unsigned long old = 0, new = 0, cur;
+
+        memcpy(&old, p_old, bytes);
+        memcpy(&new, p_new, bytes);
+        if ( lock )
+            cur = __cmpxchg(mapping, old, new, bytes);
+        else
+            cur = cmpxchg_local_(mapping, old, new, bytes);
+        if ( cur != old )
+        {
+            memcpy(p_old, &cur, bytes);
+            rc = X86EMUL_CMPXCHG_FAILED;
+        }
+        break;
+    }
+
+    case 16:
+        if ( cpu_has_cx16 )
+        {
+            __uint128_t *old = p_old, cur;
+
+            if ( lock )
+                cur = __cmpxchg16b(mapping, old, p_new);
+            else
+                cur = cmpxchg16b_local_(mapping, old, p_new);
+            if ( cur != *old )
+            {
+                *old = cur;
+                rc = X86EMUL_CMPXCHG_FAILED;
+            }
+            break;
+        }
+        /* fall through */
+    default:
+        rc = X86EMUL_UNHANDLEABLE;
+        break;
+    }
+
+    hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
+
+    return rc;
 }
 
 static int hvmemul_validate(
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
     return old;
 }
 
+static always_inline unsigned long cmpxchg_local_(
+    void *ptr, unsigned long old, unsigned long new, unsigned int size)
+{
+    unsigned long prev = ~old;
+
+    switch ( size )
+    {
+    case 1:
+        asm volatile ( "cmpxchgb %b2, %1"
+                       : "=a" (prev), "+m" (*(uint8_t *)ptr)
+                       : "q" (new), "0" (old) );
+        break;
+    case 2:
+        asm volatile ( "cmpxchgw %w2, %1"
+                       : "=a" (prev), "+m" (*(uint16_t *)ptr)
+                       : "r" (new), "0" (old) );
+        break;
+    case 4:
+        asm volatile ( "cmpxchgl %k2, %1"
+                       : "=a" (prev), "+m" (*(uint32_t *)ptr)
+                       : "r" (new), "0" (old) );
+        break;
+    case 8:
+        asm volatile ( "cmpxchgq %2, %1"
+                       : "=a" (prev), "+m" (*(uint64_t *)ptr)
+                       : "r" (new), "0" (old) );
+        break;
+    }
+
+    return prev;
+}
+
 #define cmpxchgptr(ptr,o,n) ({                                          \
     const __typeof__(**(ptr)) *__o = (o);                               \
     __typeof__(**(ptr)) *__n = (n);                                     \
--- a/xen/include/asm-x86/x86_64/system.h
+++ b/xen/include/asm-x86/x86_64/system.h
@@ -31,6 +31,24 @@ static always_inline __uint128_t __cmpxc
     return prev.raw;
 }
 
+static always_inline __uint128_t cmpxchg16b_local_(
+    void *ptr, const __uint128_t *oldp, const __uint128_t *newp)
+{
+    union {
+        struct { uint64_t lo, hi; };
+        __uint128_t raw;
+    } new = { .raw = *newp }, old = { .raw = *oldp }, prev;
+
+    ASSERT(cpu_has_cx16);
+
+    /* Don't use "=A" here - clang can't deal with that. */
+    asm volatile ( "cmpxchg16b %2"
+                   : "=d" (prev.hi), "=a" (prev.lo), "+m" (*(__uint128_t *)ptr)
+                   : "c" (new.hi), "b" (new.lo), "0" (old.hi), "1" (old.lo) );
+
+    return prev.raw;
+}
+
 #define cmpxchg16b(ptr, o, n) ({                           \
     volatile void *_p = (ptr);                             \
     ASSERT(!((unsigned long)_p & 0xf));                    \




* [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (21 preceding siblings ...)
  2017-12-07 14:16 ` [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg() Jan Beulich
@ 2017-12-07 14:17 ` Jan Beulich
  2017-12-08 10:41   ` Paul Durrant
  2018-02-02 16:37   ` Andrew Cooper
  2017-12-07 14:18 ` [PATCH v3 24/25] x86/shadow: fully move unmap-dest into common code Jan Beulich
  2017-12-07 14:19 ` [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers Jan Beulich
  24 siblings, 2 replies; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:17 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Paul Durrant

..., at least as far as currently possible, i.e. when a mapping can be
obtained.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1187,6 +1187,61 @@ static int hvmemul_write(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_rmw(
+    enum x86_segment seg,
+    unsigned long offset,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    unsigned long addr, reps = 1;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
+    int rc;
+    void *mapping;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
+    if ( rc != X86EMUL_OKAY || !bytes )
+        return rc;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+    if ( IS_ERR(mapping) )
+        return ~PTR_ERR(mapping);
+
+    if ( mapping )
+    {
+        rc = x86_emul_rmw(mapping, bytes, eflags, state, ctxt);
+        hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
+    }
+    else
+    {
+        unsigned long data = 0;
+        bool_t known_gpfn = vio->mmio_access.write_access &&
+                            vio->mmio_gla == (addr & PAGE_MASK);
+
+        if ( bytes > sizeof(data) )
+            return X86EMUL_UNHANDLEABLE;
+        rc = hvmemul_linear_mmio_read(addr, bytes, &data, pfec, hvmemul_ctxt,
+                                      known_gpfn);
+        if ( rc == X86EMUL_OKAY )
+            rc = x86_emul_rmw(&data, bytes, eflags, state, ctxt);
+        if ( rc == X86EMUL_OKAY )
+            rc = hvmemul_linear_mmio_write(addr, bytes, &data, pfec,
+                                           hvmemul_ctxt, known_gpfn);
+    }
+
+    return rc;
+}
+
 static int hvmemul_write_discard(
     enum x86_segment seg,
     unsigned long offset,
@@ -2157,6 +2212,7 @@ static const struct x86_emulate_ops hvm_
     .read          = hvmemul_read,
     .insn_fetch    = hvmemul_insn_fetch,
     .write         = hvmemul_write,
+    .rmw           = hvmemul_rmw,
     .cmpxchg       = hvmemul_cmpxchg,
     .validate      = hvmemul_validate,
     .rep_ins       = hvmemul_rep_ins,





* [PATCH v3 24/25] x86/shadow: fully move unmap-dest into common code
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (22 preceding siblings ...)
  2017-12-07 14:17 ` [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook Jan Beulich
@ 2017-12-07 14:18 ` Jan Beulich
  2018-02-02 16:46   ` Andrew Cooper
  2017-12-07 14:19 ` [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers Jan Beulich
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:18 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Tim Deegan

By adding the guest PTE size to the shadow emulation context, the work
begun by commit 2c80710a78 ("x86/shadow: compile most write emulation
code just once") can be completed, paving the way for further movement
into common code.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -319,7 +319,8 @@ static const struct x86_emulate_ops hvm_
 };
 
 const struct x86_emulate_ops *shadow_init_emulation(
-    struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
+    struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs,
+    unsigned int pte_size)
 {
     struct segment_register *creg, *sreg;
     struct vcpu *v = current;
@@ -346,6 +347,8 @@ const struct x86_emulate_ops *shadow_ini
         sh_ctxt->ctxt.sp_size   = sreg->db ? 32 : 16;
     }
 
+    sh_ctxt->pte_size = pte_size;
+
     /* Attempt to prefetch whole instruction. */
     sh_ctxt->insn_buf_eip = regs->rip;
     sh_ctxt->insn_buf_bytes =
@@ -1778,6 +1781,42 @@ void *sh_emulate_map_dest(struct vcpu *v
     return map;
 }
 
+/**************************************************************************/
+/* Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry.  This
+ * should also make finding writeable mappings of pagetables much
+ * easier. */
+
+/* Look to see if this is the second emulated write in a row to this
+ * page, and unshadow if it is */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+    struct domain *d = v->domain;
+    /* If the domain has never made a "dying" op, use the two-writes
+     * heuristic; otherwise, unshadow as soon as we write a zero for a dying
+     * process.
+     *
+     * Don't bother trying to unshadow if it's not a PT, or if it's > l1.
+     */
+    if ( ( v->arch.paging.shadow.pagetable_dying
+           || ( !d->arch.paging.shadow.pagetable_dying_op
+                && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) )
+         && sh_mfn_is_a_page_table(gmfn)
+         && (!d->arch.paging.shadow.pagetable_dying_op ||
+             !(mfn_to_page(gmfn)->shadow_flags
+               & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) )
+    {
+        perfc_incr(shadow_early_unshadow);
+        sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
+        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
+    }
+    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
+#endif
+}
+
 /*
  * Tidy up after the emulated write: mark pages dirty, verify the new
  * contents, and undo the mapping.
@@ -1787,6 +1826,19 @@ void sh_emulate_unmap_dest(struct vcpu *
 {
     u32 b1 = bytes, b2 = 0, shflags;
 
+    ASSERT(mfn_valid(sh_ctxt->mfn[0]));
+
+    /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
+    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
+    {
+        if ( !((unsigned long)addr & (sh_ctxt->pte_size - 1)) )
+            check_for_early_unshadow(v, sh_ctxt->mfn[0]);
+        /* Don't reset the heuristic if we're writing zeros at non-aligned
+         * addresses, otherwise it doesn't catch REP MOVSD on PAE guests */
+    }
+    else
+        sh_reset_early_unshadow(v);
+
     /*
      * We can avoid re-verifying the page contents after the write if:
      *  - it was no larger than the PTE type of this pagetable;
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -2534,52 +2534,6 @@ sh_map_and_validate_gl1e(struct vcpu *v,
 
 
 /**************************************************************************/
-/* Optimization: If we see two emulated writes of zeros to the same
- * page-table without another kind of page fault in between, we guess
- * that this is a batch of changes (for process destruction) and
- * unshadow the page so we don't take a pagefault on every entry.  This
- * should also make finding writeable mappings of pagetables much
- * easier. */
-
-/* Look to see if this is the second emulated write in a row to this
- * page, and unshadow if it is */
-static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
-{
-#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
-    struct domain *d = v->domain;
-    /* If the domain has never made a "dying" op, use the two-writes
-     * heuristic; otherwise, unshadow as soon as we write a zero for a dying
-     * process.
-     *
-     * Don't bother trying to unshadow if it's not a PT, or if it's > l1.
-     */
-    if ( ( v->arch.paging.shadow.pagetable_dying
-           || ( !d->arch.paging.shadow.pagetable_dying_op
-                && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) )
-         && sh_mfn_is_a_page_table(gmfn)
-         && (!d->arch.paging.shadow.pagetable_dying_op ||
-             !(mfn_to_page(gmfn)->shadow_flags
-               & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) )
-    {
-        perfc_incr(shadow_early_unshadow);
-        sh_remove_shadows(d, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
-        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
-    }
-    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
-#endif
-}
-
-/* Stop counting towards early unshadows, as we've seen a real page fault */
-static inline void reset_early_unshadow(struct vcpu *v)
-{
-#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
-    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(INVALID_MFN);
-#endif
-}
-
-
-
-/**************************************************************************/
 /* Optimization: Prefetch multiple L1 entries.  This is called after we have
  * demand-faulted a shadow l1e in the fault handler, to see if it's
  * worth fetching some more.
@@ -2942,7 +2896,7 @@ static int sh_page_fault(struct vcpu *v,
                  * a not-present fault (by flipping two bits). */
                 ASSERT(regs->error_code & PFEC_page_present);
                 regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
-                reset_early_unshadow(v);
+                sh_reset_early_unshadow(v);
                 perfc_incr(shadow_fault_fast_gnp);
                 SHADOW_PRINTK("fast path not-present\n");
                 trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
@@ -2958,7 +2912,7 @@ static int sh_page_fault(struct vcpu *v,
             }
             perfc_incr(shadow_fault_fast_mmio);
             SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
-            reset_early_unshadow(v);
+            sh_reset_early_unshadow(v);
             trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
             return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT, access)
                     ? EXCRET_fault_fixed : 0);
@@ -3070,7 +3024,7 @@ static int sh_page_fault(struct vcpu *v,
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
-        reset_early_unshadow(v);
+        sh_reset_early_unshadow(v);
         regs->error_code = gw.pfec & PFEC_arch_mask;
         goto propagate;
     }
@@ -3096,7 +3050,7 @@ static int sh_page_fault(struct vcpu *v,
         perfc_incr(shadow_fault_bail_bad_gfn);
         SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
                       gfn_x(gfn), mfn_x(gmfn));
-        reset_early_unshadow(v);
+        sh_reset_early_unshadow(v);
         put_gfn(d, gfn_x(gfn));
         goto propagate;
     }
@@ -3285,7 +3239,7 @@ static int sh_page_fault(struct vcpu *v,
 
     perfc_incr(shadow_fault_fixed);
     d->arch.paging.log_dirty.fault_count++;
-    reset_early_unshadow(v);
+    sh_reset_early_unshadow(v);
 
     trace_shadow_fixup(gw.l1e, va);
  done:
@@ -3400,7 +3354,7 @@ static int sh_page_fault(struct vcpu *v,
 
     SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n", regs->rip, regs->rsp);
 
-    emul_ops = shadow_init_emulation(&emul_ctxt, regs);
+    emul_ops = shadow_init_emulation(&emul_ctxt, regs, GUEST_PTE_SIZE);
 
     r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
 
@@ -3540,7 +3494,7 @@ static int sh_page_fault(struct vcpu *v,
     sh_audit_gw(v, &gw);
     SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
     shadow_audit_tables(v);
-    reset_early_unshadow(v);
+    sh_reset_early_unshadow(v);
     paging_unlock(d);
     put_gfn(d, gfn_x(gfn));
     trace_shadow_gen(TRC_SHADOW_MMIO, va);
@@ -3551,7 +3505,7 @@ static int sh_page_fault(struct vcpu *v,
     sh_audit_gw(v, &gw);
     SHADOW_PRINTK("not a shadow fault\n");
     shadow_audit_tables(v);
-    reset_early_unshadow(v);
+    sh_reset_early_unshadow(v);
     paging_unlock(d);
     put_gfn(d, gfn_x(gfn));
 
@@ -4677,29 +4631,6 @@ static void sh_pagetable_dying(struct vc
 /**************************************************************************/
 /* Handling guest writes to pagetables. */
 
-/* Tidy up after the emulated write: mark pages dirty, verify the new
- * contents, and undo the mapping */
-static void emulate_unmap_dest(struct vcpu *v,
-                               void *addr,
-                               u32 bytes,
-                               struct sh_emulate_ctxt *sh_ctxt)
-{
-    ASSERT(mfn_valid(sh_ctxt->mfn[0]));
-
-    /* If we are writing lots of PTE-aligned zeros, might want to unshadow */
-    if ( likely(bytes >= 4) && (*(u32 *)addr == 0) )
-    {
-        if ( ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 )
-            check_for_early_unshadow(v, sh_ctxt->mfn[0]);
-        /* Don't reset the heuristic if we're writing zeros at non-aligned
-         * addresses, otherwise it doesn't catch REP MOVSD on PAE guests */
-    }
-    else
-        reset_early_unshadow(v);
-
-    sh_emulate_unmap_dest(v, addr, bytes, sh_ctxt);
-}
-
 static int
 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
                      u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
@@ -4733,7 +4664,7 @@ sh_x86_emulate_write(struct vcpu *v, uns
 #endif
     }
 
-    emulate_unmap_dest(v, addr, bytes, sh_ctxt);
+    sh_emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
     paging_unlock(v->domain);
     return X86EMUL_OKAY;
@@ -4778,7 +4709,7 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
                   " wanted %#lx now %#lx bytes %u\n",
                   vaddr, prev, old, new, *(unsigned long *)addr, bytes);
 
-    emulate_unmap_dest(v, addr, bytes, sh_ctxt);
+    sh_emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
     paging_unlock(v->domain);
     return rv;
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -721,6 +721,8 @@ struct sh_emulate_ctxt {
     uint8_t insn_buf_bytes;
     unsigned long insn_buf_eip;
 
+    unsigned int pte_size;
+
     /* Cache of segment registers already gathered for this emulation. */
     unsigned int valid_seg_regs;
     struct segment_register seg_reg[6];
@@ -736,10 +738,19 @@ struct sh_emulate_ctxt {
 };
 
 const struct x86_emulate_ops *shadow_init_emulation(
-    struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs);
+    struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs,
+    unsigned int pte_size);
 void shadow_continue_emulation(
     struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs);
 
+/* Stop counting towards early unshadows, as we've seen a real page fault */
+static inline void sh_reset_early_unshadow(struct vcpu *v)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+    v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(INVALID_MFN);
+#endif
+}
+
 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
 /**************************************************************************/
 /* Virtual TLB entries




* [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers
  2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
                   ` (23 preceding siblings ...)
  2017-12-07 14:18 ` [PATCH v3 24/25] x86/shadow: fully move unmap-dest into common code Jan Beulich
@ 2017-12-07 14:19 ` Jan Beulich
  2018-02-02 16:52   ` Andrew Cooper
  24 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2017-12-07 14:19 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Tim Deegan

The functions each have only a single caller and are now guest paging
type independent (except for the tracing part), so there is no need for
them to exist as standalone functions, let alone multiple times.
Replace the two prior hooks with just a single one for dealing with
tracing.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -118,6 +118,20 @@ __initcall(shadow_audit_key_init);
  */
 
 /*
+ * Returns a mapped pointer to write to, or one of the following error
+ * indicators.
+ */
+#define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE)
+#define MAPPING_EXCEPTION    ERR_PTR(~(long)X86EMUL_EXCEPTION)
+#define MAPPING_SILENT_FAIL  ERR_PTR(~(long)X86EMUL_OKAY)
+static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
+                                 unsigned int bytes,
+                                 struct sh_emulate_ctxt *sh_ctxt);
+static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
+                                  unsigned int bytes,
+                                  struct sh_emulate_ctxt *sh_ctxt);
+
+/*
  * Callers which pass a known in-range x86_segment can rely on the return
  * pointer being valid.  Other callers must explicitly check for errors.
  */
@@ -260,6 +274,7 @@ hvm_emulate_write(enum x86_segment seg,
         container_of(ctxt, struct sh_emulate_ctxt, ctxt);
     struct vcpu *v = current;
     unsigned long addr;
+    void *ptr;
     int rc;
 
     /* How many emulations could we save if we unshadowed on stack writes? */
@@ -271,8 +286,26 @@ hvm_emulate_write(enum x86_segment seg,
     if ( rc || !bytes )
         return rc;
 
-    return v->arch.paging.mode->shadow.x86_emulate_write(
-        v, addr, p_data, bytes, sh_ctxt);
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
+        return X86EMUL_UNHANDLEABLE;
+
+    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
+    if ( IS_ERR(ptr) )
+        return ~PTR_ERR(ptr);
+
+    paging_lock(v->domain);
+    memcpy(ptr, p_data, bytes);
+
+    if ( tb_init_done )
+        v->arch.paging.mode->shadow.trace_emul_write_val(ptr, addr,
+                                                         p_data, bytes);
+
+    sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt);
+    shadow_audit_tables(v);
+    paging_unlock(v->domain);
+
+    return X86EMUL_OKAY;
 }
 
 static int
@@ -287,7 +320,8 @@ hvm_emulate_cmpxchg(enum x86_segment seg
     struct sh_emulate_ctxt *sh_ctxt =
         container_of(ctxt, struct sh_emulate_ctxt, ctxt);
     struct vcpu *v = current;
-    unsigned long addr, old, new;
+    unsigned long addr, old, new, prev;
+    void *ptr;
     int rc;
 
     if ( bytes > sizeof(long) )
@@ -298,14 +332,43 @@ hvm_emulate_cmpxchg(enum x86_segment seg
     if ( rc )
         return rc;
 
+    /* Unaligned writes are only acceptable on HVM */
+    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
+        return X86EMUL_UNHANDLEABLE;
+
+    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
+    if ( IS_ERR(ptr) )
+        return ~PTR_ERR(ptr);
+
     old = new = 0;
     memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
-    rc = v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-             v, addr, &old, new, bytes, sh_ctxt);
+    paging_lock(v->domain);
+    switch ( bytes )
+    {
+    case 1: prev = cmpxchg((uint8_t  *)ptr, old, new); break;
+    case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break;
+    case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break;
+    case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break;
+    default:
+        SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes);
+        prev = ~old;
+    }
+
+    if ( prev != old )
+    {
+        memcpy(p_old, &prev, bytes);
+        rc = X86EMUL_CMPXCHG_FAILED;
+    }
+
+    SHADOW_DEBUG(EMULATE,
+                 "va %#lx was %#lx expected %#lx wanted %#lx now %#lx bytes %u\n",
+                 addr, prev, old, new, *(unsigned long *)ptr, bytes);
 
-    memcpy(p_old, &old, bytes);
+    sh_emulate_unmap_dest(v, ptr, bytes, sh_ctxt);
+    shadow_audit_tables(v);
+    paging_unlock(v->domain);
 
     return rc;
 }
@@ -1693,9 +1756,9 @@ static mfn_t emulate_gva_to_mfn(struct v
  * returned, page references will be held on sh_ctxt->mfn[0] and
  * sh_ctxt->mfn[1] iff !INVALID_MFN.
  */
-void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
-                          unsigned int bytes,
-                          struct sh_emulate_ctxt *sh_ctxt)
+static void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
+                                 unsigned int bytes,
+                                 struct sh_emulate_ctxt *sh_ctxt)
 {
     struct domain *d = v->domain;
     void *map;
@@ -1821,8 +1884,9 @@ static inline void check_for_early_unsha
  * Tidy up after the emulated write: mark pages dirty, verify the new
  * contents, and undo the mapping.
  */
-void sh_emulate_unmap_dest(struct vcpu *v, void *addr, unsigned int bytes,
-                           struct sh_emulate_ctxt *sh_ctxt)
+static void sh_emulate_unmap_dest(struct vcpu *v, void *addr,
+                                  unsigned int bytes,
+                                  struct sh_emulate_ctxt *sh_ctxt)
 {
     u32 b1 = bytes, b2 = 0, shflags;
 
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -2744,6 +2744,25 @@ static DEFINE_PER_CPU(int,trace_extra_em
 #endif
 static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
 
+static void trace_emulate_write_val(const void *ptr, unsigned long vaddr,
+                                    const void *src, unsigned int bytes)
+{
+#if GUEST_PAGING_LEVELS == 3
+    if ( vaddr == this_cpu(trace_emulate_initial_va) )
+        memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
+    else if ( (vaddr & ~(GUEST_PTE_SIZE - 1)) ==
+              this_cpu(trace_emulate_initial_va) )
+    {
+        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
+        memcpy(&this_cpu(trace_emulate_write_val),
+               (typeof(ptr))((unsigned long)ptr & ~(GUEST_PTE_SIZE - 1)),
+               GUEST_PTE_SIZE);
+    }
+#else
+    memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
+#endif
+}
+
 static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
 {
     if ( tb_init_done )
@@ -4629,93 +4648,6 @@ static void sh_pagetable_dying(struct vc
 #endif
 
 /**************************************************************************/
-/* Handling guest writes to pagetables. */
-
-static int
-sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
-                     u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
-{
-    void *addr;
-
-    /* Unaligned writes are only acceptable on HVM */
-    if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v)  )
-        return X86EMUL_UNHANDLEABLE;
-
-    addr = sh_emulate_map_dest(v, vaddr, bytes, sh_ctxt);
-    if ( IS_ERR(addr) )
-        return ~PTR_ERR(addr);
-
-    paging_lock(v->domain);
-    memcpy(addr, src, bytes);
-
-    if ( tb_init_done )
-    {
-#if GUEST_PAGING_LEVELS == 3
-        if ( vaddr == this_cpu(trace_emulate_initial_va) )
-            memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
-        else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
-        {
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
-            memcpy(&this_cpu(trace_emulate_write_val),
-                   (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
-        }
-#else
-        memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
-#endif
-    }
-
-    sh_emulate_unmap_dest(v, addr, bytes, sh_ctxt);
-    shadow_audit_tables(v);
-    paging_unlock(v->domain);
-    return X86EMUL_OKAY;
-}
-
-static int
-sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
-                       unsigned long *p_old, unsigned long new,
-                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
-{
-    void *addr;
-    unsigned long prev, old = *p_old;
-    int rv = X86EMUL_OKAY;
-
-    /* Unaligned writes are only acceptable on HVM */
-    if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v)  )
-        return X86EMUL_UNHANDLEABLE;
-
-    addr = sh_emulate_map_dest(v, vaddr, bytes, sh_ctxt);
-    if ( IS_ERR(addr) )
-        return ~PTR_ERR(addr);
-
-    paging_lock(v->domain);
-    switch ( bytes )
-    {
-    case 1: prev = cmpxchg(((u8 *)addr), old, new);  break;
-    case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
-    case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
-    case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
-    default:
-        SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
-        prev = ~old;
-    }
-
-    if ( prev != old )
-    {
-        *p_old = prev;
-        rv = X86EMUL_CMPXCHG_FAILED;
-    }
-
-    SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
-                  " wanted %#lx now %#lx bytes %u\n",
-                  vaddr, prev, old, new, *(unsigned long *)addr, bytes);
-
-    sh_emulate_unmap_dest(v, addr, bytes, sh_ctxt);
-    shadow_audit_tables(v);
-    paging_unlock(v->domain);
-    return rv;
-}
-
-/**************************************************************************/
 /* Audit tools */
 
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
@@ -5036,8 +4968,6 @@ const struct paging_mode sh_paging_mode
     .write_p2m_entry               = shadow_write_p2m_entry,
     .guest_levels                  = GUEST_PAGING_LEVELS,
     .shadow.detach_old_tables      = sh_detach_old_tables,
-    .shadow.x86_emulate_write      = sh_x86_emulate_write,
-    .shadow.x86_emulate_cmpxchg    = sh_x86_emulate_cmpxchg,
     .shadow.write_guest_entry      = sh_write_guest_entry,
     .shadow.cmpxchg_guest_entry    = sh_cmpxchg_guest_entry,
     .shadow.make_monitor_table     = sh_make_monitor_table,
@@ -5046,6 +4976,7 @@ const struct paging_mode sh_paging_mode
     .shadow.guess_wrmap            = sh_guess_wrmap,
 #endif
     .shadow.pagetable_dying        = sh_pagetable_dying,
+    .shadow.trace_emul_write_val   = trace_emulate_write_val,
     .shadow.shadow_levels          = SHADOW_PAGING_LEVELS,
 };
 
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -393,16 +393,6 @@ void shadow_update_paging_modes(struct v
  * With user_only == 1, unhooks only the user-mode mappings. */
 void shadow_unhook_mappings(struct domain *d, mfn_t smfn, int user_only);
 
-/* Returns a mapped pointer to write to, or one of the following error
- * indicators. */
-#define MAPPING_UNHANDLEABLE ERR_PTR(~(long)X86EMUL_UNHANDLEABLE)
-#define MAPPING_EXCEPTION    ERR_PTR(~(long)X86EMUL_EXCEPTION)
-#define MAPPING_SILENT_FAIL  ERR_PTR(~(long)X86EMUL_OKAY)
-void *sh_emulate_map_dest(struct vcpu *v, unsigned long vaddr,
-                          unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt);
-void sh_emulate_unmap_dest(struct vcpu *v, void *addr, unsigned int bytes,
-                           struct sh_emulate_ctxt *sh_ctxt);
-
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
 /* Allow a shadowed page to go out of sync */
 int sh_unsync(struct vcpu *v, mfn_t gmfn);
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -248,8 +248,6 @@ static inline shadow_l4e_t shadow_l4e_fr
 #define sh_unhook_64b_mappings     INTERNAL_NAME(sh_unhook_64b_mappings)
 #define sh_paging_mode             INTERNAL_NAME(sh_paging_mode)
 #define sh_detach_old_tables       INTERNAL_NAME(sh_detach_old_tables)
-#define sh_x86_emulate_write       INTERNAL_NAME(sh_x86_emulate_write)
-#define sh_x86_emulate_cmpxchg     INTERNAL_NAME(sh_x86_emulate_cmpxchg)
 #define sh_audit_l1_table          INTERNAL_NAME(sh_audit_l1_table)
 #define sh_audit_fl1_table         INTERNAL_NAME(sh_audit_fl1_table)
 #define sh_audit_l2_table          INTERNAL_NAME(sh_audit_l2_table)
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -82,14 +82,6 @@ struct sh_emulate_ctxt;
 struct shadow_paging_mode {
 #ifdef CONFIG_SHADOW_PAGING
     void          (*detach_old_tables     )(struct vcpu *v);
-    int           (*x86_emulate_write     )(struct vcpu *v, unsigned long va,
-                                            void *src, u32 bytes,
-                                            struct sh_emulate_ctxt *sh_ctxt);
-    int           (*x86_emulate_cmpxchg   )(struct vcpu *v, unsigned long va,
-                                            unsigned long *old,
-                                            unsigned long new,
-                                            unsigned int bytes,
-                                            struct sh_emulate_ctxt *sh_ctxt);
     bool          (*write_guest_entry     )(struct vcpu *v, intpte_t *p,
                                             intpte_t new, mfn_t gmfn);
     bool          (*cmpxchg_guest_entry   )(struct vcpu *v, intpte_t *p,
@@ -100,6 +92,8 @@ struct shadow_paging_mode {
     int           (*guess_wrmap           )(struct vcpu *v, 
                                             unsigned long vaddr, mfn_t gmfn);
     void          (*pagetable_dying       )(struct vcpu *v, paddr_t gpa);
+    void          (*trace_emul_write_val  )(const void *ptr, unsigned long vaddr,
+                                            const void *src, unsigned int bytes);
 #endif
     /* For outsiders to tell what mode we're in */
     unsigned int shadow_levels;





* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2017-12-07 14:16 ` [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg() Jan Beulich
@ 2017-12-07 14:38   ` Razvan Cojocaru
  2017-12-08 10:38   ` Paul Durrant
  2018-02-02 16:36   ` Andrew Cooper
  2 siblings, 0 replies; 85+ messages in thread
From: Razvan Cojocaru @ 2017-12-07 14:38 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Andrew Cooper, Paul Durrant

On 12/07/2017 04:16 PM, Jan Beulich wrote:
> ..., at least as far as currently possible, i.e. when a mapping can be
> obtained.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Thank you for the patch!


* Re: [PATCH v3 01/25] x86emul: make decode_register() return unsigned long *
  2017-12-07 13:58 ` [PATCH v3 01/25] x86emul: make decode_register() return unsigned long * Jan Beulich
@ 2017-12-07 18:32   ` Andrew Cooper
  2017-12-08  7:44     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2017-12-07 18:32 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Kevin Tian, Jun Nakajima

On 07/12/17 13:58, Jan Beulich wrote:
> Quite a few casts can be dropped this way, and type-safeness is being
> increased by not using void * (same goes for decode_vex_gpr()). Drop
> casts and no longer needed intermediate variables where possible. Take
> the opportunity and also switch the last parameter to bool.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

This will need rebasing over 053ae230b1, but that only adjusts the
type of the index parameter, so it shouldn't cause further problems.

However, is this wise?  I can certainly see the attraction of not
needing to cast away from void *, but you now give the impression that
it is safe to dereference the returned pointer as an unsigned long,
even in the cases where it isn't safe.

At least with returning void*, the required cast highlights that
something special is going on.
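
To illustrate the concern with a hypothetical caller (the byte
registers are the problematic case, e.g. AH resolves to a pointer one
byte into rax; this snippet is an illustration only, not part of the
patch):

    static void example(struct cpu_user_regs *regs, uint8_t val)
    {
        unsigned long *p = decode_register(4, regs, true); /* AH */

        *p = val;             /* Looks fine, but clobbers well beyond AH. */
        *(uint8_t *)p = val;  /* What such a caller really has to do. */
    }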

~Andrew


* Re: [PATCH v3 02/25] x86emul: build SIMD tests with -Os
  2017-12-07 13:59 ` [PATCH v3 02/25] x86emul: build SIMD tests with -Os Jan Beulich
@ 2017-12-07 18:32   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2017-12-07 18:32 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 13:59, Jan Beulich wrote:
> Specifically, in the context of putting together subsequent patches
> I've noticed that, together with the touch() macro, using -Os further
> increases the chances of the compiler using memory operands for the
> instructions we actually care to test.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: George Dunlap <george.dunlap@citrix.com>
>
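
(For context: touch() itself isn't quoted here. The idea is roughly a
macro along the lines below, which forces the object to stay live in
memory so that, combined with -Os, the instruction under test is more
likely to get a memory operand. This is a sketch only, not necessarily
the exact definition used in the harness.)

    #define touch(var) asm volatile ( "" : "+m" (var) )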

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 01/25] x86emul: make decode_register() return unsigned long *
  2017-12-07 18:32   ` Andrew Cooper
@ 2017-12-08  7:44     ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2017-12-08  7:44 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Kevin Tian, Jun Nakajima

>>> On 07.12.17 at 19:32, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 13:58, Jan Beulich wrote:
>> Quite a few casts can be dropped this way, and type-safeness is being
>> increased by not using void * (same goes for decode_vex_gpr()). Drop
>> casts and no longer needed intermediate variables where possible. Take
>> the opportunity and also switch the last parameter to bool.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> This will need rebasing over 053ae230b1, but that only adjusts the
> type of the index parameter, so it shouldn't cause further problems.

Oh, indeed, I should have checked.

> However, is this wise?  I can certainly see the attraction of not
> needing to cast away from void *, but you now give the impression that
> it is safe to dereference the returned pointer as an unsigned long,
> even in the cases where it isn't safe.
> 
> At least with returning void*, the required cast highlights that
> something special is going on.

How about this: I drop the last parameter from the function so
that callers outside of the emulator won't be misled (and we'll
have a new internal function with that parameter kept). Internally
in the emulator we store pointers to long anyway, so the change
here is no net increase in the risk of getting things wrong. We
could even go as far as keeping the internal function returning
void *, but in the three places where the last argument isn't
false/zero the return value is stored into struct operand's reg
field anyway (which is unsigned long *), so I don't see any value
in doing so.
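
To make the suggestion concrete, something along these lines (a sketch
only; the names and exact placement are invented here and need not
match what an eventual patch would look like):

    /* Public variant: only ever yields whole-register pointers. */
    unsigned long *decode_register(unsigned int modrm_reg,
                                   struct cpu_user_regs *regs);

    /* Emulator-internal variant, keeping the byte-register flag. */
    unsigned long *decode_gpr_(unsigned int modrm_reg,
                               struct cpu_user_regs *regs,
                               bool highbyte_regs);

    unsigned long *decode_register(unsigned int modrm_reg,
                                   struct cpu_user_regs *regs)
    {
        return decode_gpr_(modrm_reg, regs, false);
    }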

Jan



* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2017-12-07 14:16 ` [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg() Jan Beulich
  2017-12-07 14:38   ` Razvan Cojocaru
@ 2017-12-08 10:38   ` Paul Durrant
  2018-02-02 16:36   ` Andrew Cooper
  2 siblings, 0 replies; 85+ messages in thread
From: Paul Durrant @ 2017-12-08 10:38 UTC (permalink / raw)
  To: 'Jan Beulich', xen-devel; +Cc: Andrew Cooper, George Dunlap

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 07 December 2017 14:17
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
> <Paul.Durrant@citrix.com>; George Dunlap <George.Dunlap@citrix.com>
> Subject: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in
> hvmemul_cmpxchg()
> 
> ..., at least as far as currently possible, i.e. when a mapping can be
> obtained.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> v3: New.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1296,8 +1296,83 @@ static int hvmemul_cmpxchg(
>      bool lock,
>      struct x86_emulate_ctxt *ctxt)
>  {
> -    /* Fix this in case the guest is really relying on r-m-w atomicity. */
> -    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> +    struct vcpu *curr = current;
> +    unsigned long addr, reps = 1;
> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;
> +    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
> +    int rc;
> +    void *mapping = NULL;
> +
> +    rc = hvmemul_virtual_to_linear(
> +        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    if ( is_x86_system_segment(seg) )
> +        pfec |= PFEC_implicit;
> +    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +        pfec |= PFEC_user_mode;
> +
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec,
> hvmemul_ctxt);
> +    if ( IS_ERR(mapping) )
> +        return ~PTR_ERR(mapping);
> +
> +    if ( !mapping )
> +    {
> +        /* Fix this in case the guest is really relying on r-m-w atomicity. */
> +        return hvmemul_linear_mmio_write(addr, bytes, p_new, pfec,
> +                                         hvmemul_ctxt,
> +                                         vio->mmio_access.write_access &&
> +                                         vio->mmio_gla == (addr & PAGE_MASK));
> +    }
> +
> +    switch ( bytes )
> +    {
> +    case 1: case 2: case 4: case 8:
> +    {
> +        unsigned long old = 0, new = 0, cur;
> +
> +        memcpy(&old, p_old, bytes);
> +        memcpy(&new, p_new, bytes);
> +        if ( lock )
> +            cur = __cmpxchg(mapping, old, new, bytes);
> +        else
> +            cur = cmpxchg_local_(mapping, old, new, bytes);
> +        if ( cur != old )
> +        {
> +            memcpy(p_old, &cur, bytes);
> +            rc = X86EMUL_CMPXCHG_FAILED;
> +        }
> +        break;
> +    }
> +
> +    case 16:
> +        if ( cpu_has_cx16 )
> +        {
> +            __uint128_t *old = p_old, cur;
> +
> +            if ( lock )
> +                cur = __cmpxchg16b(mapping, old, p_new);
> +            else
> +                cur = cmpxchg16b_local_(mapping, old, p_new);
> +            if ( cur != *old )
> +            {
> +                *old = cur;
> +                rc = X86EMUL_CMPXCHG_FAILED;
> +            }
> +            break;
> +        }
> +        /* fall through */
> +    default:
> +        rc = X86EMUL_UNHANDLEABLE;
> +        break;
> +    }
> +
> +    hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
> +
> +    return rc;
>  }
> 
>  static int hvmemul_validate(
> --- a/xen/include/asm-x86/system.h
> +++ b/xen/include/asm-x86/system.h
> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>      return old;
>  }
> 
> +static always_inline unsigned long cmpxchg_local_(
> +    void *ptr, unsigned long old, unsigned long new, unsigned int size)
> +{
> +    unsigned long prev = ~old;
> +
> +    switch ( size )
> +    {
> +    case 1:
> +        asm volatile ( "cmpxchgb %b2, %1"
> +                       : "=a" (prev), "+m" (*(uint8_t *)ptr)
> +                       : "q" (new), "0" (old) );
> +        break;
> +    case 2:
> +        asm volatile ( "cmpxchgw %w2, %1"
> +                       : "=a" (prev), "+m" (*(uint16_t *)ptr)
> +                       : "r" (new), "0" (old) );
> +        break;
> +    case 4:
> +        asm volatile ( "cmpxchgl %k2, %1"
> +                       : "=a" (prev), "+m" (*(uint32_t *)ptr)
> +                       : "r" (new), "0" (old) );
> +        break;
> +    case 8:
> +        asm volatile ( "cmpxchgq %2, %1"
> +                       : "=a" (prev), "+m" (*(uint64_t *)ptr)
> +                       : "r" (new), "0" (old) );
> +        break;
> +    }
> +
> +    return prev;
> +}
> +
>  #define cmpxchgptr(ptr,o,n) ({                                          \
>      const __typeof__(**(ptr)) *__o = (o);                               \
>      __typeof__(**(ptr)) *__n = (n);                                     \
> --- a/xen/include/asm-x86/x86_64/system.h
> +++ b/xen/include/asm-x86/x86_64/system.h
> @@ -31,6 +31,24 @@ static always_inline __uint128_t __cmpxc
>      return prev.raw;
>  }
> 
> +static always_inline __uint128_t cmpxchg16b_local_(
> +    void *ptr, const __uint128_t *oldp, const __uint128_t *newp)
> +{
> +    union {
> +        struct { uint64_t lo, hi; };
> +        __uint128_t raw;
> +    } new = { .raw = *newp }, old = { .raw = *oldp }, prev;
> +
> +    ASSERT(cpu_has_cx16);
> +
> +    /* Don't use "=A" here - clang can't deal with that. */
> +    asm volatile ( "cmpxchg16b %2"
> +                   : "=d" (prev.hi), "=a" (prev.lo), "+m" (*(__uint128_t *)ptr)
> +                   : "c" (new.hi), "b" (new.lo), "0" (old.hi), "1" (old.lo) );
> +
> +    return prev.raw;
> +}
> +
>  #define cmpxchg16b(ptr, o, n) ({                           \
>      volatile void *_p = (ptr);                             \
>      ASSERT(!((unsigned long)_p & 0xf));                    \
> 



* Re: [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook
  2017-12-07 14:17 ` [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook Jan Beulich
@ 2017-12-08 10:41   ` Paul Durrant
  2018-02-02 16:37   ` Andrew Cooper
  1 sibling, 0 replies; 85+ messages in thread
From: Paul Durrant @ 2017-12-08 10:41 UTC (permalink / raw)
  To: 'Jan Beulich', xen-devel; +Cc: Andrew Cooper, George Dunlap

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 07 December 2017 14:18
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
> <Paul.Durrant@citrix.com>; George Dunlap <George.Dunlap@citrix.com>
> Subject: [PATCH v3 23/25] x86/HVM: make use of new read-modify-write
> emulator hook
> 
> ..., at least as far as currently possible, i.e. when a mapping can be
> obtained.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> v3: New.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1187,6 +1187,61 @@ static int hvmemul_write(
>      return X86EMUL_OKAY;
>  }
> 
> +static int hvmemul_rmw(
> +    enum x86_segment seg,
> +    unsigned long offset,
> +    unsigned int bytes,
> +    uint32_t *eflags,
> +    struct x86_emulate_state *state,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> +    unsigned long addr, reps = 1;
> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;
> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
> +    int rc;
> +    void *mapping;
> +
> +    rc = hvmemul_virtual_to_linear(
> +        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
> +    if ( rc != X86EMUL_OKAY || !bytes )
> +        return rc;
> +
> +    if ( is_x86_system_segment(seg) )
> +        pfec |= PFEC_implicit;
> +    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +        pfec |= PFEC_user_mode;
> +
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec,
> hvmemul_ctxt);
> +    if ( IS_ERR(mapping) )
> +        return ~PTR_ERR(mapping);
> +
> +    if ( mapping )
> +    {
> +        rc = x86_emul_rmw(mapping, bytes, eflags, state, ctxt);
> +        hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
> +    }
> +    else
> +    {
> +        unsigned long data = 0;
> +        bool_t known_gpfn = vio->mmio_access.write_access &&
> +                            vio->mmio_gla == (addr & PAGE_MASK);
> +
> +        if ( bytes > sizeof(data) )
> +            return X86EMUL_UNHANDLEABLE;
> +        rc = hvmemul_linear_mmio_read(addr, bytes, &data, pfec,
> hvmemul_ctxt,
> +                                      known_gpfn);
> +        if ( rc == X86EMUL_OKAY )
> +            rc = x86_emul_rmw(&data, bytes, eflags, state, ctxt);
> +        if ( rc == X86EMUL_OKAY )
> +            rc = hvmemul_linear_mmio_write(addr, bytes, &data, pfec,
> +                                           hvmemul_ctxt, known_gpfn);
> +    }
> +
> +    return rc;
> +}
> +
>  static int hvmemul_write_discard(
>      enum x86_segment seg,
>      unsigned long offset,
> @@ -2157,6 +2212,7 @@ static const struct x86_emulate_ops hvm_
>      .read          = hvmemul_read,
>      .insn_fetch    = hvmemul_insn_fetch,
>      .write         = hvmemul_write,
> +    .rmw           = hvmemul_rmw,
>      .cmpxchg       = hvmemul_cmpxchg,
>      .validate      = hvmemul_validate,
>      .rep_ins       = hvmemul_rep_ins,
> 
> 



* Re: [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect
  2017-12-07 14:14 ` [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect Jan Beulich
@ 2017-12-08 10:58   ` Paul Durrant
  2018-02-02 14:13   ` Andrew Cooper
  1 sibling, 0 replies; 85+ messages in thread
From: Paul Durrant @ 2017-12-08 10:58 UTC (permalink / raw)
  To: 'Jan Beulich', xen-devel
  Cc: Andrew Cooper, Tim (Xen.org), George Dunlap

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 07 December 2017 14:14
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
> <Paul.Durrant@citrix.com>; George Dunlap <George.Dunlap@citrix.com>;
> Tim (Xen.org) <tim@xen.org>
> Subject: [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in
> effect
> 
> This is necessary for the hook to correctly perform the operation.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> v3: New.
> 
> --- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
> +++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c
> @@ -346,6 +346,7 @@ static int fuzz_cmpxchg(
>      void *old,
>      void *new,
>      unsigned int bytes,
> +    bool lock,
>      struct x86_emulate_ctxt *ctxt)
>  {
>      /*
> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> @@ -320,6 +320,7 @@ static int cmpxchg(
>      void *old,
>      void *new,
>      unsigned int bytes,
> +    bool lock,
>      struct x86_emulate_ctxt *ctxt)
>  {
>      if ( verbose )
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1249,6 +1249,7 @@ static int hvmemul_cmpxchg_discard(
>      void *p_old,
>      void *p_new,
>      unsigned int bytes,
> +    bool lock,
>      struct x86_emulate_ctxt *ctxt)
>  {
>      return X86EMUL_OKAY;
> @@ -1292,6 +1293,7 @@ static int hvmemul_cmpxchg(
>      void *p_old,
>      void *p_new,
>      unsigned int bytes,
> +    bool lock,
>      struct x86_emulate_ctxt *ctxt)
>  {
>      /* Fix this in case the guest is really relying on r-m-w atomicity. */
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -281,6 +281,7 @@ hvm_emulate_cmpxchg(enum x86_segment seg
>                      void *p_old,
>                      void *p_new,
>                      unsigned int bytes,
> +                    bool lock,
>                      struct x86_emulate_ctxt *ctxt)
>  {
>      struct sh_emulate_ctxt *sh_ctxt =
> --- a/xen/arch/x86/pv/ro-page-fault.c
> +++ b/xen/arch/x86/pv/ro-page-fault.c
> @@ -216,7 +216,7 @@ static int ptwr_emulated_write(enum x86_
> 
>  static int ptwr_emulated_cmpxchg(enum x86_segment seg, unsigned long
> offset,
>                                   void *p_old, void *p_new, unsigned int bytes,
> -                                 struct x86_emulate_ctxt *ctxt)
> +                                 bool lock, struct x86_emulate_ctxt *ctxt)
>  {
>      paddr_t old = 0, new = 0;
> 
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -1973,7 +1973,7 @@ protmode_load_seg(
> 
>          fail_if(!ops->cmpxchg);
>          switch ( (rc = ops->cmpxchg(sel_seg, (sel & 0xfff8) + 4, &desc.b,
> -                                    &new_desc_b, sizeof(desc.b), ctxt)) )
> +                                    &new_desc_b, sizeof(desc.b), true, ctxt)) )
>          {
>          case X86EMUL_OKAY:
>              break;
> @@ -6982,7 +6982,8 @@ x86_emulate(
>              }
> 
>              if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
> -                                    op_bytes, ctxt)) != X86EMUL_OKAY )
> +                                    op_bytes, lock_prefix,
> +                                    ctxt)) != X86EMUL_OKAY )
>                  goto done;
>              _regs.eflags |= X86_EFLAGS_ZF;
>          }
> @@ -8434,7 +8435,7 @@ x86_emulate(
>              fail_if(!ops->cmpxchg);
>              rc = ops->cmpxchg(
>                  dst.mem.seg, dst.mem.off, &dst.orig_val,
> -                &dst.val, dst.bytes, ctxt);
> +                &dst.val, dst.bytes, true, ctxt);
>          }
>          else
>          {
> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -237,10 +237,11 @@ struct x86_emulate_ops
>          struct x86_emulate_ctxt *ctxt);
> 
>      /*
> -     * cmpxchg: Emulate an atomic (LOCKed) CMPXCHG operation.
> +     * cmpxchg: Emulate a CMPXCHG operation.
>       *  @p_old: [IN ] Pointer to value expected to be current at @addr.
>       *  @p_new: [IN ] Pointer to value to write to @addr.
>       *  @bytes: [IN ] Operation size (up to 8 (x86/32) or 16 (x86/64) bytes).
> +     *  @lock:  [IN ] atomic (LOCKed) operation
>       */
>      int (*cmpxchg)(
>          enum x86_segment seg,
> @@ -248,6 +249,7 @@ struct x86_emulate_ops
>          void *p_old,
>          void *p_new,
>          unsigned int bytes,
> +        bool lock,
>          struct x86_emulate_ctxt *ctxt);
> 
>      /*
> 



* Re: [PATCH v3 03/25] x86emul: support F16C insns
  2017-12-07 14:00 ` [PATCH v3 03/25] x86emul: support F16C insns Jan Beulich
@ 2018-01-31 18:58   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-01-31 18:58 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:00, Jan Beulich wrote:
> Note that this avoids emulating the behavior of VCVTPS2PH found on at
> least some Intel CPUs, which update MXCSR even when the memory write
> faults.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 04/25] x86emul: support FMA4 insns
  2017-12-07 14:01 ` [PATCH v3 04/25] x86emul: support FMA4 insns Jan Beulich
@ 2018-01-31 19:51   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-01-31 19:51 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:01, Jan Beulich wrote:
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 05/25] x86emul: support FMA insns
  2017-12-07 14:02 ` [PATCH v3 05/25] x86emul: support FMA insns Jan Beulich
@ 2018-02-01 16:15   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-01 16:15 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:02, Jan Beulich wrote:
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper@citrix.com>


* Re: [PATCH v3 06/25] x86emul: support most remaining AVX2 insns
  2017-12-07 14:03 ` [PATCH v3 06/25] x86emul: support most remaining AVX2 insns Jan Beulich
@ 2018-02-01 19:45   ` Andrew Cooper
  2018-02-02  9:29     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-01 19:45 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:03, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -370,7 +370,7 @@ static const struct {
>      [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
>      [0x10] = { .simd_size = simd_packed_int },
>      [0x13] = { .simd_size = simd_other, .two_op = 1 },
> -    [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
> +    [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
>      [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
>      [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
>      [0x1a] = { .simd_size = simd_128, .two_op = 1 },
> @@ -382,9 +382,15 @@ static const struct {
>      [0x2c ... 0x2d] = { .simd_size = simd_other },
>      [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
>      [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
> -    [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
> +    [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
>      [0x40] = { .simd_size = simd_packed_int },
>      [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
> +    [0x45 ... 0x47] = { .simd_size = simd_packed_int },
> +    [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
> +    [0x5a] = { .simd_size = simd_128, .two_op = 1 },
> +    [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
> +    [0x8c] = { .simd_size = simd_other },
> +    [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
>      [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
>      [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
>      [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
> @@ -406,6 +412,9 @@ static const struct {
>      uint8_t two_op:1;
>      uint8_t four_op:1;
>  } ext0f3a_table[256] = {
> +    [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
> +    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
> +    [0x02] = { .simd_size = simd_packed_int },
>      [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
>      [0x06] = { .simd_size = simd_packed_fp },
>      [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
> @@ -419,9 +428,12 @@ static const struct {
>      [0x20] = { .simd_size = simd_none },
>      [0x21] = { .simd_size = simd_other },
>      [0x22] = { .simd_size = simd_none },
> +    [0x38] = { .simd_size = simd_128 },
> +    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
>      [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
>      [0x42] = { .simd_size = simd_packed_int },
>      [0x44] = { .simd_size = simd_packed_int },
> +    [0x46] = { .simd_size = simd_packed_int },
>      [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
>      [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
>      [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
> @@ -2973,7 +2985,7 @@ x86_decode(
>          }
>          break;
>  
> -    case simd_scalar_fp:
> +    case simd_scalar_fp: /* case simd_scalar_dq: */

I don't see this case label used, or introduced in any later patches. 
Is it stale?

>          op_bytes = 4 << (ctxt->opcode & 1);
>          break;
>  
> @@ -6070,6 +6082,10 @@ x86_emulate(
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
>              if ( !vex.l )
>                  goto simd_0f_avx;
> +            /* fall through */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */

0x46 / vpsrav{d,q}?  You add a decode for it above, but I don't see an
introduced case.

> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x47): /* vpsllv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
> +    simd_0f_avx2:
>              host_and_vcpu_must_have(avx2);
>              goto simd_0f_ymm;
>          }
> @@ -6169,7 +6185,10 @@ x86_emulate(
>      case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
>              if ( vex.l )
> +            {
> +    simd_0f_imm8_avx2:
>                  host_and_vcpu_must_have(avx2);
> +            }
>              else
>              {
>      case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
> @@ -7150,12 +7169,16 @@ x86_emulate(
>          fic.insn_bytes = PFX_BYTES + 3;
>          break;
>  
> -    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,ymm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
>          generate_exception_if(!vex.l, EXC_UD);
>          /* fall through */
> -    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
> -        generate_exception_if(ea.type != OP_MEM, EXC_UD);
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,{x,y}mm */

It would help reviewability substantially if you split bugfixes of
existing code out separately from introduction of new code, especially
given the quantity of new additions here.  These comment changes are
particularly deceptive.

> +        if ( ea.type != OP_MEM )
> +        {
> +            generate_exception_if(b & 2, EXC_UD);
> +            host_and_vcpu_must_have(avx2);
> +        }
>          /* fall through */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
> @@ -7254,6 +7277,11 @@ x86_emulate(
>          op_bytes = 8 << vex.l;
>          goto simd_0f_ymm;
>  
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
> +        generate_exception_if(!vex.l || vex.w, EXC_UD);
> +        goto simd_0f_avx2;
> +

Looking at these additions, the case labels look like they need sorting
again.  Are you going to organise that in a later patch?

>      case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
> @@ -7370,6 +7398,80 @@ x86_emulate(
>          generate_exception_if(vex.l, EXC_UD);
>          goto simd_0f_avx;
>  
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
> +        op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
> +        /* fall through */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
> +        generate_exception_if(vex.w, EXC_UD);

Oh - here is vpsrav{d,q}.  Why is it not with its companions?  The
manual does curiously omit mention of the W1 encoding for VEX (unlike
its companions), but all 3 have W0 and W1 mentioned for EVEX encoding. 
Judging by them all having identical behaviour, and this one not being
declared as suffering a fault because of W, I expect that it is probably
encoded as WIG.

I've noticed lower down as well that you are inconsistent with vex.w
handling compared to the manual as to whether you reject or ignore
unspecified encodings.  Is this intentional, and if so, why?

~Andrew


* Re: [PATCH v3 07/25] x86emul: support AVX2 gather insns
  2017-12-07 14:03 ` [PATCH v3 07/25] x86emul: support AVX2 gather insns Jan Beulich
@ 2018-02-01 20:53   ` Andrew Cooper
  2018-02-02  9:44     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-01 20:53 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:03, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -391,6 +391,7 @@ static const struct {
>      [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
>      [0x8c] = { .simd_size = simd_other },
>      [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
> +    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
>      [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
>      [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
>      [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
> @@ -598,6 +599,7 @@ struct x86_emulate_state {
>          ext_8f0a,
>      } ext;
>      uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
> +    uint8_t sib_index, sib_scale;
>      uint8_t rex_prefix;
>      bool lock_prefix;
>      bool not_64bit; /* Instruction not available in 64bit. */
> @@ -2411,7 +2413,7 @@ x86_decode(
>      struct x86_emulate_ctxt *ctxt,
>      const struct x86_emulate_ops  *ops)
>  {
> -    uint8_t b, d, sib, sib_index, sib_base;
> +    uint8_t b, d;
>      unsigned int def_op_bytes, def_ad_bytes, opcode;
>      enum x86_segment override_seg = x86_seg_none;
>      bool pc_rel = false;
> @@ -2745,6 +2747,7 @@ x86_decode(
>  
>          if ( modrm_mod == 3 )
>          {
> +            generate_exception_if(d & vSIB, EXC_UD);
>              modrm_rm |= (rex_prefix & 1) << 3;
>              ea.type = OP_REG;
>          }
> @@ -2805,13 +2808,17 @@ x86_decode(
>              ea.type = OP_MEM;
>              if ( modrm_rm == 4 )
>              {
> -                sib = insn_fetch_type(uint8_t);
> -                sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
> -                sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
> -                if ( sib_index != 4 && !(d & vSIB) )
> -                    ea.mem.off = *decode_register(sib_index, state->regs,
> -                                                  false);
> -                ea.mem.off <<= (sib >> 6) & 3;
> +                uint8_t sib = insn_fetch_type(uint8_t);
> +                uint8_t sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
> +
> +                state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
> +                state->sib_scale = (sib >> 6) & 3;
> +                if ( state->sib_index != 4 && !(d & vSIB) )
> +                {
> +                    ea.mem.off = *decode_register(state->sib_index,
> +                                                  state->regs, false);
> +                    ea.mem.off <<= state->sib_scale;

This is a functional change.

> +                }
>                  if ( (modrm_mod == 0) && ((sib_base & 7) == 5) )
>                      ea.mem.off += insn_fetch_type(int32_t);
>                  else if ( sib_base == 4 )
> @@ -7472,6 +7479,110 @@ x86_emulate(
>          break;
>      }
>  
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x92): /* vgatherdp{s,d} {x,y}mm,mem,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x93): /* vgatherqp{s,d} {x,y}mm,mem,{x,y}mm */
> +    {
> +        unsigned int mask_reg = ~vex.reg & (mode_64bit() ? 0xf : 7);
> +        typeof(vex) *pvex;
> +        union {
> +            int32_t dw[8];
> +            int64_t qw[4];
> +        } index, mask;
> +
> +        ASSERT(ea.type == OP_MEM);
> +        generate_exception_if(modrm_reg == state->sib_index ||
> +                              modrm_reg == mask_reg ||
> +                              state->sib_index == mask_reg, EXC_UD);
> +        generate_exception_if(!cpu_has_avx, EXC_UD);
> +        vcpu_must_have(avx2);
> +        get_fpu(X86EMUL_FPU_ymm, &fic);
> +
> +        /* Read destination, index, and mask registers. */
> +        opc = init_prefixes(stub);
> +        pvex = copy_VEX(opc, vex);
> +        pvex->opcx = vex_0f;
> +        opc[0] = 0x7f; /* vmovdqa */
> +        /* Use (%rax) as destination and modrm_reg as source. */
> +        pvex->r = !mode_64bit() || !(modrm_reg & 8);
> +        pvex->b = 1;
> +        opc[1] = (modrm_reg & 7) << 3;
> +        pvex->reg = 0xf;
> +        opc[2] = 0xc3;
> +
> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
> +
> +        pvex->pfx = vex_f3; /* vmovdqu */
> +        /* Switch to sib_index as source. */
> +        pvex->r = !mode_64bit() || !(state->sib_index & 8);
> +        opc[1] = (state->sib_index & 7) << 3;
> +
> +        invoke_stub("", "", "=m" (index) : "a" (&index));
> +
> +        /* Switch to mask_reg as source. */
> +        pvex->r = !mode_64bit() || !(mask_reg & 8);
> +        opc[1] = (mask_reg & 7) << 3;
> +
> +        invoke_stub("", "", "=m" (mask) : "a" (&mask));
> +        put_stub(stub);
> +
> +        /* Clear untouched parts of the destination and mask values. */
> +        n = 1 << (2 + vex.l - ((b & 1) | vex.w));
> +        op_bytes = 4 << vex.w;
> +        memset((void *)mmvalp + n * op_bytes, 0, 32 - n * op_bytes);
> +        memset((void *)&mask + n * op_bytes, 0, 32 - n * op_bytes);
> +
> +        for ( i = 0; i < n && rc == X86EMUL_OKAY; ++i )
> +        {
> +            if ( (vex.w ? mask.qw[i] : mask.dw[i]) < 0 )
> +            {
> +                signed long idx = b & 1 ? index.qw[i] : index.dw[i];
> +
> +                rc = ops->read(ea.mem.seg,
> +                               ea.mem.off + (idx << state->sib_scale),
> +                               (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
> +                if ( rc != X86EMUL_OKAY )
> +                    break;
> +
> +#ifdef __XEN__
> +                if ( i + 1 < n && local_events_need_delivery() )
> +                    rc = X86EMUL_RETRY;
> +#endif
> +            }
> +
> +            if ( vex.w )
> +                mask.qw[i] = 0;
> +            else
> +                mask.dw[i] = 0;
> +        }

The incomplete case here is rather more complicated.  In the case that
rc != OK and local events are pending, RF needs setting, although it is
not clear if this is only applicable if an exception is pending, or
between every element.

> +
> +        /* Write destination and mask registers. */
> +        opc = init_prefixes(stub);
> +        pvex = copy_VEX(opc, vex);
> +        pvex->opcx = vex_0f;
> +        opc[0] = 0x6f; /* vmovdqa */
> +        /* Use modrm_reg as destination and (%rax) as source. */
> +        pvex->r = !mode_64bit() || !(modrm_reg & 8);
> +        pvex->b = 1;
> +        opc[1] = (modrm_reg & 7) << 3;
> +        pvex->reg = 0xf;
> +        opc[2] = 0xc3;
> +
> +        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
> +
> +        pvex->pfx = vex_f3; /* vmovdqu */
> +        /* Switch to mask_reg as destination. */
> +        pvex->r = !mode_64bit() || !(mask_reg & 8);
> +        opc[1] = (mask_reg & 7) << 3;
> +
> +        invoke_stub("", "", "+m" (mask) : "a" (&mask));
> +        put_stub(stub);
> +
> +        state->simd_size = simd_none;
> +        break;
> +    }
> +
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
>      case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
> --- a/xen/arch/x86/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate.c
> @@ -10,6 +10,7 @@
>   */
>  
>  #include <xen/domain_page.h>
> +#include <xen/event.h>

Spurious hunk?

~Andrew

>  #include <asm/x86_emulate.h>
>  #include <asm/asm_defns.h> /* mark_regs_dirty() */
>  #include <asm/processor.h> /* current_cpu_info */
>
>



* Re: [PATCH v3 06/25] x86emul: support most remaining AVX2 insns
  2018-02-01 19:45   ` Andrew Cooper
@ 2018-02-02  9:29     ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-02  9:29 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 01.02.18 at 20:45, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:03, Jan Beulich wrote:
>> @@ -2973,7 +2985,7 @@ x86_decode(
>>          }
>>          break;
>>  
>> -    case simd_scalar_fp:
>> +    case simd_scalar_fp: /* case simd_scalar_dq: */
> 
> I don't see this case label used, or introduced in any later patches. 
> Is it stale?

Oh, indeed it is. And it's been so long that I don't even recall what
it was used for.

>> @@ -6070,6 +6082,10 @@ x86_emulate(
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
>>              if ( !vex.l )
>>                  goto simd_0f_avx;
>> +            /* fall through */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x45): /* vpsrlv{d,q} {x,y}mm/mem,{x,y}mm,{x,y}mm */
> 
> 0x46 / vpsrav{d,q}?  You add a decode for it above, but I don't see an
> introduced case.

See further down - it doesn't fit here very well because there's only
vpsravd, but no vpsravq (which only appears in AVX512).

>> @@ -7150,12 +7169,16 @@ x86_emulate(
>>          fic.insn_bytes = PFX_BYTES + 3;
>>          break;
>>  
>> -    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd m64,ymm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,ymm */
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
>>          generate_exception_if(!vex.l, EXC_UD);
>>          /* fall through */
>> -    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss m32,{x,y}mm */
>> -        generate_exception_if(ea.type != OP_MEM, EXC_UD);
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,{x,y}mm */
> 
> It would help reviewability substantially if you split bugfixes of
> existing code out separately from introduction of new code, especially
> given the quantity of new additions here.  These comment changes are
> particularly deceptive.

This is not a bugfix - the register forms appear only in AVX2.
Normally I would have added their support right when the
instructions were added for AVX, but it was George who noticed
them missing after the AVX patch had gone in already. Changing
the code here still fits under the topic of the patch.

>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x0c): /* vpermilps {x,y}mm/mem,{x,y}mm,{x,y}mm */
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x0d): /* vpermilpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
>> @@ -7254,6 +7277,11 @@ x86_emulate(
>>          op_bytes = 8 << vex.l;
>>          goto simd_0f_ymm;
>>  
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
>> +        generate_exception_if(!vex.l || vex.w, EXC_UD);
>> +        goto simd_0f_avx2;
>> +
> 
> Looking at these additions, the case labels look like they need sorting
> again.  Are you going to organise that in a later patch?

I don't understand. Together with context above and ...

>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
>>      case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */

... here I don't see what's wrong. The lowest entry in each block
of case statements wanting similar treatment is what governs the
ordering: 0x16 is above 0x0d and below 0x20.

>> @@ -7370,6 +7398,80 @@ x86_emulate(
>>          generate_exception_if(vex.l, EXC_UD);
>>          goto simd_0f_avx;
>>  
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
>> +        op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
>> +        /* fall through */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
>> +        generate_exception_if(vex.w, EXC_UD);
> 
> Oh - here is vpsrav{d,q}.  Why is it not with its companions?  The
> manual does curiously omit mention of the W1 encoding for VEX (unlike
> its companions), but all 3 have W0 and W1 mentioned for EVEX encoding. 
> Judging by them all having identical behaviour, and this one not being
> declared as suffering a fault because of W, I expect that it is probably
> encoded as WIG.

Without trying it out (which is unreliable, as they might change things
between silicon revisions), I have to follow what the SDM and/or XED
say, and both say W0. We both know that things aren't always
consistent in the manual, so I prefer to use the tighter variant in case
of doubt - imo it's always better to relax things down the road than to
rely on catching faults raised from the stubs.

And trying it out (again; I'm sure I did so a year ago when this was
all put together), my Haswell faults for the W1 encoding.

> I've noticed lower down as well that you are inconsistent with vex.w
> handling compared to the manual as to whether you reject or ignore
> unspecified encodings.  Is this intentional, and if so, why?

I'm not intending to do anything inconsistently. It would help if you
could point out where you found such issues. Actually there's not a
whole lot of things left further down, and I can't spot any
inconsistency there.

Jan


* Re: [PATCH v3 07/25] x86emul: support AVX2 gather insns
  2018-02-01 20:53   ` Andrew Cooper
@ 2018-02-02  9:44     ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-02  9:44 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 01.02.18 at 21:53, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:03, Jan Beulich wrote:
>> @@ -2805,13 +2808,17 @@ x86_decode(
>>              ea.type = OP_MEM;
>>              if ( modrm_rm == 4 )
>>              {
>> -                sib = insn_fetch_type(uint8_t);
>> -                sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
>> -                sib_base  = (sib & 7) | ((rex_prefix << 3) & 8);
>> -                if ( sib_index != 4 && !(d & vSIB) )
>> -                    ea.mem.off = *decode_register(sib_index, state->regs,
>> -                                                  false);
>> -                ea.mem.off <<= (sib >> 6) & 3;
>> +                uint8_t sib = insn_fetch_type(uint8_t);
>> +                uint8_t sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
>> +
>> +                state->sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
>> +                state->sib_scale = (sib >> 6) & 3;
>> +                if ( state->sib_index != 4 && !(d & vSIB) )
>> +                {
>> +                    ea.mem.off = *decode_register(state->sib_index,
>> +                                                  state->regs, false);
>> +                    ea.mem.off <<= state->sib_scale;
> 
> This is a functional change.

In what way? The code is just being re-organized, so that the two
pieces of information needed later go into the new state fields. Are
you perhaps referring to the shift previously having happened
outside the if()? With the condition being false, that was simply
shifting zero left (or else it would have been wrong to sit outside
the if()).
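
(Illustration only, not from the patch -- "use_index" here is a stand-in
for "sib_index != 4 && !(d & vSIB)":

    unsigned long off = 0;

    if ( use_index )
        off = *decode_register(sib_index, regs, false);
    off <<= sib_scale;   /* old placement: shifts a zero when !use_index */

i.e. hoisting the shift into the if() can't change the resulting offset.)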

>> @@ -7472,6 +7479,110 @@ x86_emulate(
>>          break;
>>      }
>>  
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x92): /* vgatherdp{s,d} {x,y}mm,mem,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f38, 0x93): /* vgatherqp{s,d} {x,y}mm,mem,{x,y}mm */
>> +    {
>> +        unsigned int mask_reg = ~vex.reg & (mode_64bit() ? 0xf : 7);
>> +        typeof(vex) *pvex;
>> +        union {
>> +            int32_t dw[8];
>> +            int64_t qw[4];
>> +        } index, mask;
>> +
>> +        ASSERT(ea.type == OP_MEM);
>> +        generate_exception_if(modrm_reg == state->sib_index ||
>> +                              modrm_reg == mask_reg ||
>> +                              state->sib_index == mask_reg, EXC_UD);
>> +        generate_exception_if(!cpu_has_avx, EXC_UD);
>> +        vcpu_must_have(avx2);
>> +        get_fpu(X86EMUL_FPU_ymm, &fic);
>> +
>> +        /* Read destination, index, and mask registers. */
>> +        opc = init_prefixes(stub);
>> +        pvex = copy_VEX(opc, vex);
>> +        pvex->opcx = vex_0f;
>> +        opc[0] = 0x7f; /* vmovdqa */
>> +        /* Use (%rax) as destination and modrm_reg as source. */
>> +        pvex->r = !mode_64bit() || !(modrm_reg & 8);
>> +        pvex->b = 1;
>> +        opc[1] = (modrm_reg & 7) << 3;
>> +        pvex->reg = 0xf;
>> +        opc[2] = 0xc3;
>> +
>> +        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
>> +
>> +        pvex->pfx = vex_f3; /* vmovdqu */
>> +        /* Switch to sib_index as source. */
>> +        pvex->r = !mode_64bit() || !(state->sib_index & 8);
>> +        opc[1] = (state->sib_index & 7) << 3;
>> +
>> +        invoke_stub("", "", "=m" (index) : "a" (&index));
>> +
>> +        /* Switch to mask_reg as source. */
>> +        pvex->r = !mode_64bit() || !(mask_reg & 8);
>> +        opc[1] = (mask_reg & 7) << 3;
>> +
>> +        invoke_stub("", "", "=m" (mask) : "a" (&mask));
>> +        put_stub(stub);
>> +
>> +        /* Clear untouched parts of the destination and mask values. */
>> +        n = 1 << (2 + vex.l - ((b & 1) | vex.w));
>> +        op_bytes = 4 << vex.w;
>> +        memset((void *)mmvalp + n * op_bytes, 0, 32 - n * op_bytes);
>> +        memset((void *)&mask + n * op_bytes, 0, 32 - n * op_bytes);
>> +
>> +        for ( i = 0; i < n && rc == X86EMUL_OKAY; ++i )
>> +        {
>> +            if ( (vex.w ? mask.qw[i] : mask.dw[i]) < 0 )
>> +            {
>> +                signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>> +
>> +                rc = ops->read(ea.mem.seg,
>> +                               ea.mem.off + (idx << state->sib_scale),
>> +                               (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
>> +                if ( rc != X86EMUL_OKAY )
>> +                    break;
>> +
>> +#ifdef __XEN__
>> +                if ( i + 1 < n && local_events_need_delivery() )
>> +                    rc = X86EMUL_RETRY;
>> +#endif
>> +            }
>> +
>> +            if ( vex.w )
>> +                mask.qw[i] = 0;
>> +            else
>> +                mask.dw[i] = 0;
>> +        }
> 
> The incomplete case here is rather more complicated.  In the case that
> rc != OK and local events are pending, RF needs setting, although it is
> not clear if this is only applicable if an exception is pending, or
> between every element.

Isn't this a more general issue, e.g. also applicable to repeated
string insns? Right now we only ever clear RF. I'm not convinced
dealing with this belongs here.

>> --- a/xen/arch/x86/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate.c
>> @@ -10,6 +10,7 @@
>>   */
>>  
>>  #include <xen/domain_page.h>
>> +#include <xen/event.h>
> 
> Spurious hunk?

No - this is for local_events_need_delivery() (still visible above).

Jan


* Re: [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces
  2017-12-07 14:04 ` [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
@ 2018-02-02 11:43   ` Andrew Cooper
  2018-02-02 15:15     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 11:43 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:04, Jan Beulich wrote:
> Convert the few existing opcodes so far supported.
>
> Also adjust two vex_* case labels to better be ext_* (the values are
> identical).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -458,6 +458,20 @@ static const opcode_desc_t xop_table[] =
>      DstReg|SrcImm|ModRM,
>  };
>  
> +static const struct {
> +    uint8_t simd_size:5;
> +    uint8_t two_op:1;
> +    uint8_t four_op:1;
> +} ext8f08_table[256] = {
> +};
> +
> +static const struct {
> +    uint8_t simd_size:5;
> +    uint8_t two_op:1;
> +} ext8f09_table[256] = {
> +    [0x01 ... 0x02] = { .two_op = 1 },
> +};
> +

What about 8f0a?  We've got emulation for bextr already, and might want
to consider #GRP4 seeing as we expose LWP to guests.

>  #define REX_PREFIX 0x40
>  #define REX_B 0x01
>  #define REX_X 0x02
> @@ -2726,7 +2740,7 @@ x86_decode(
>              }
>              break;
>  
> -        case vex_0f38:
> +        case ext_0f38:
>              d = ext0f38_table[b].to_mem ? DstMem | SrcReg
>                                          : DstReg | SrcMem;
>              if ( ext0f38_table[b].two_op )
> @@ -2736,7 +2750,14 @@ x86_decode(
>              state->simd_size = ext0f38_table[b].simd_size;
>              break;
>  
> -        case vex_0f3a:
> +        case ext_8f09:
> +            if ( ext8f09_table[b].two_op )
> +                d |= TwoOp;
> +            state->simd_size = ext8f09_table[b].simd_size;
> +            break;
> +
> +        case ext_0f3a:
> +        case ext_8f08:
>              /*
>               * Cannot update d here yet, as the immediate operand still
>               * needs fetching.
> @@ -2928,6 +2949,15 @@ x86_decode(
>          break;
>  
>      case ext_8f08:
> +        d = DstReg | SrcMem;
> +        if ( ext8f08_table[b].two_op )
> +            d |= TwoOp;
> +        else if ( ext8f08_table[b].four_op && !mode_64bit() )
> +            imm1 &= 0x7f;
> +        state->desc = d;
> +        state->simd_size = ext8f08_table[b].simd_size;
> +        break;

I presume that these don't actually impact our currently implemented XOP
instructions?

~Andrew

> +
>      case ext_8f09:
>      case ext_8f0a:
>          break;
>
>
>



* Re: [PATCH v3 09/25] x86emul: support XOP insns
  2017-12-07 14:04 ` [PATCH v3 09/25] x86emul: support XOP insns Jan Beulich
@ 2018-02-02 12:03   ` Andrew Cooper
  2018-02-02 15:17     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 12:03 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:04, Jan Beulich wrote:
> @@ -8027,6 +8060,13 @@ x86_emulate(
>          generate_exception_if(vex.w, EXC_UD);
>          goto simd_0f_imm8_avx;
>  
> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
> +                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
> +                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
> +        host_and_vcpu_must_have(xop);
> +        goto simd_0f_imm8_ymm;

Is this correct?  VEX.W selects which operand may be the memory operand,
and I don't see anything in the decode which copes, or anything in the
stub which adjusts .W.

~Andrew


* Re: [PATCH v3 10/25] x86emul: support 3DNow! insns
  2017-12-07 14:05 ` [PATCH v3 10/25] x86emul: support 3DNow! insns Jan Beulich
@ 2018-02-02 13:02   ` Andrew Cooper
  2018-02-02 15:22     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:02 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:05, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -355,6 +355,36 @@ static const struct {
>      [0xff] = { ModRM }
>  };
>  
> +static const uint16_t _3dnow_table[16] = {

Comment explaining how these mappings work?  It looks like nibble
splits, but I still can't work out how to cross-reference with the opcode
tables.
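
If I'm reading it right (sketch, not from the patch): the 3DNow! suffix
byte is split into nibbles, the high one indexing the table and the low
one selecting a bit.  E.g. pfadd is 0F 0F /r with suffix 0x9e, so the
check in the emulation code amounts to

    _3dnow_table[(0x9e >> 4) & 0xf]   /* == _3dnow_table[0x9] */
        & (1 << (0x9e & 0xf))         /* == 1 << 0xe, the pfadd bit above */

A comment to this effect next to the tables would help.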

> +    [0x0] = (1 << 0xd) /* pi2fd */,
> +    [0x1] = (1 << 0xd) /* pf2id */,
> +    [0x9] = (1 << 0x0) /* pfcmpge */ |
> +            (1 << 0x4) /* pfmin */ |
> +            (1 << 0x6) /* pfrcp */ |
> +            (1 << 0x7) /* pfrsqrt */ |
> +            (1 << 0xa) /* pfsub */ |
> +            (1 << 0xe) /* pfadd */,
> +    [0xa] = (1 << 0x0) /* pfcmpge */ |
> +            (1 << 0x4) /* pfmax */ |
> +            (1 << 0x6) /* pfrcpit1 */ |
> +            (1 << 0x7) /* pfrsqit1 */ |
> +            (1 << 0xa) /* pfsubr */ |
> +            (1 << 0xe) /* pfacc */,
> +    [0xb] = (1 << 0x0) /* pfcmpeq */ |
> +            (1 << 0x4) /* pfmul */ |
> +            (1 << 0x6) /* pfrcpit2 */ |
> +            (1 << 0x7) /* pmulhrw */ |
> +            (1 << 0xf) /* pavgusb */,
> +};
> +
> +static const uint16_t _3dnow_ext_table[16] = {
> +    [0x1] = (1 << 0xd) /* pi2fw */,
> +    [0x1] = (1 << 0xc) /* pf2iw */,

You presumably want an | in here instead?

> +    [0x8] = (1 << 0xa) /* pfnacc */ |
> +            (1 << 0xa) /* pfpnacc */,
> +    [0xb] = (1 << 0xb) /* pfswapd */,
> +};
> +
>  /*
>   * "two_op" and "four_op" below refer to the number of register operands
>   * (one of which possibly also allowing to be a memory one). The named
> @@ -1671,6 +1701,8 @@ static bool vcpu_has(
>  #define vcpu_has_rdrand()      vcpu_has(         1, ECX, 30, ctxt, ops)
>  #define vcpu_has_mmxext()     (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
>                                 vcpu_has_sse())
> +#define vcpu_has_3dnow_ext()   vcpu_has(0x80000001, EDX, 30, ctxt, ops)
> +#define vcpu_has_3dnow()       vcpu_has(0x80000001, EDX, 31, ctxt, ops)
>  #define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
>  #define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
>  #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
> @@ -5505,6 +5537,26 @@ x86_emulate(
>      case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
>          break;
>  

0f 0d prefetches?  They are 3DNow! instructions, but available on later
processors.

~Andrew

> +    case X86EMUL_OPC(0x0f, 0x0e): /* femms */
> +        host_and_vcpu_must_have(3dnow);
> +        asm volatile ( "femms" );
> +        break;
> +
> +    case X86EMUL_OPC(0x0f, 0x0f): /* 3DNow! */
> +        if ( _3dnow_ext_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
> +            host_and_vcpu_must_have(3dnow_ext);
> +        else if ( _3dnow_table[(imm1 >> 4) & 0xf] & (1 << (imm1 & 0xf)) )
> +            host_and_vcpu_must_have(3dnow);
> +        else
> +            generate_exception(EXC_UD);
> +
> +        get_fpu(X86EMUL_FPU_mmx, &fic);
> +
> +        d = DstReg | SrcMem;
> +        op_bytes = 8;
> +        state->simd_size = simd_other;
> +        goto simd_0f_imm8;
> +
>  #define CASE_SIMD_PACKED_INT(pfx, opc)       \
>      case X86EMUL_OPC(pfx, opc):              \
>      case X86EMUL_OPC_66(pfx, opc)
> --- a/xen/include/asm-x86/cpufeature.h
> +++ b/xen/include/asm-x86/cpufeature.h
> @@ -71,6 +71,8 @@
>                                   && boot_cpu_has(X86_FEATURE_FFXSR))
>  #define cpu_has_page1gb         boot_cpu_has(X86_FEATURE_PAGE1GB)
>  #define cpu_has_rdtscp          boot_cpu_has(X86_FEATURE_RDTSCP)
> +#define cpu_has_3dnow_ext       boot_cpu_has(X86_FEATURE_3DNOWEXT)
> +#define cpu_has_3dnow           boot_cpu_has(X86_FEATURE_3DNOW)
>  
>  /* CPUID level 0x80000001.ecx */
>  #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)
>
>



* Re: [PATCH v3 11/25] x86emul: place test blobs in executable section
  2017-12-07 14:06 ` [PATCH v3 11/25] x86emul: place test blobs in executable section Jan Beulich
@ 2018-02-02 13:03   ` Andrew Cooper
  2018-02-02 15:27     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:03 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:06, Jan Beulich wrote:
> This allows the section contents to be disassembled without going
> through any extra hoops, simplifying the analysis of problems in test
> and/or emulation code.
>
> The blobs being emitted as (r/o) data means we need to accept an
> assembler warning here (about the differing section attributes).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

What about just giving up their constness?  This is a test program after
all.

~Andrew


* Re: [PATCH v3 12/25] x86emul: abstract out XCRn accesses
  2017-12-07 14:07 ` [PATCH v3 12/25] x86emul: abstract out XCRn accesses Jan Beulich
@ 2018-02-02 13:29   ` Andrew Cooper
  2018-02-02 17:05     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:29 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:07, Jan Beulich wrote:
> --- a/tools/tests/x86_emulator/x86-emulate.c
> +++ b/tools/tests/x86_emulator/x86-emulate.c
> @@ -120,6 +120,19 @@ int emul_test_read_cr(
>      return X86EMUL_UNHANDLEABLE;
>  }
>  
> +int emul_test_read_xcr(
> +    unsigned int reg,
> +    uint64_t *val,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    uint32_t lo, hi;
> +
> +    asm ( "xgetbv" : "=a" (lo), "=d" (hi) : "c" (reg) );
> +    *val = lo | ((uint64_t)hi << 32);

This will want a reg filter, or AFL will find that trying to read reg 2
will explode.
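
Something like (sketch only; whether the harness can sensibly key the
upper bound off cpu_has_xgetbv1 is a separate question):

    if ( reg > 1 )
        return X86EMUL_UNHANDLEABLE;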

> +
> +    return X86EMUL_OKAY;
> +}
> +
>  int emul_test_get_fpu(
>      void (*exception_callback)(void *, struct cpu_user_regs *),
>      void *exception_callback_arg,
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1825,6 +1825,49 @@ static int hvmemul_write_cr(
>      return rc;
>  }
>  
> +static int hvmemul_read_xcr(
> +    unsigned int reg,
> +    uint64_t *val,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    uint32_t lo, hi;
> +
> +    switch ( reg )
> +    {
> +    case 0:
> +        *val = current->arch.xcr0;
> +        return X86EMUL_OKAY;
> +
> +    case 1:
> +        if ( !cpu_has_xgetbv1 )
> +            return X86EMUL_UNHANDLEABLE;
> +        break;
> +
> +    default:
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    asm ( ".byte 0x0f,0x01,0xd0" /* xgetbv */
> +          : "=a" (lo), "=d" (hi) : "c" (reg) );

Please can we have a static inline?  It needs to be volatile, because
the result depends on unspecified other operations, which for xgetbv1
includes any instruction which alters xsave state.
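
i.e. something like (sketch only, helper name invented here):

    static inline uint64_t xgetbv(unsigned int index)
    {
        uint32_t lo, hi;

        asm volatile ( ".byte 0x0f,0x01,0xd0" /* xgetbv */
                       : "=a" (lo), "=d" (hi) : "c" (index) );

        return lo | ((uint64_t)hi << 32);
    }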

Furthermore, does this actually return the correct result?  I'd prefer
if we didn't have to read from hardware here, but I can't see an
alternative.

From the guest's point of view, we should at least have the guest's xcr0
in context, but we have xcr0_accum loaded, meaning that the guest is
liable to see returned set bits which are higher than its idea of xcr0.

> +    *val = lo | ((uint64_t)hi << 32);
> +    HVMTRACE_LONG_2D(XCR_READ, reg, TRC_PAR_LONG(*val));
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +static int hvmemul_write_xcr(
> +    unsigned int reg,
> +    uint64_t val,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    HVMTRACE_LONG_2D(XCR_WRITE, reg, TRC_PAR_LONG(val));
> +    if ( likely(handle_xsetbv(reg, val) == 0) )
> +        return X86EMUL_OKAY;
> +
> +    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
> +    return X86EMUL_EXCEPTION;

This exception is inconsistent with unhandleable above.  FTR, I'd expect
all of them to be exception rather than unhandleable.
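
i.e. (sketch) the failure paths on the read side could mirror the write
side's:

    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
    return X86EMUL_EXCEPTION;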

> +}
> +
>  static int hvmemul_read_msr(
>      unsigned int reg,
>      uint64_t *val,
> @@ -5161,18 +5182,33 @@ x86_emulate(
>                  _regs.eflags |= X86_EFLAGS_AC;
>              break;
>  
> -#ifdef __XEN__
> -        case 0xd1: /* xsetbv */
> +        case 0xd0: /* xgetbv */
>              generate_exception_if(vex.pfx, EXC_UD);
> -            if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
> +            if ( !ops->read_cr || !ops->read_xcr ||
> +                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
>                  cr4 = 0;
>              generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
> -            generate_exception_if(!mode_ring0() ||
> -                                  handle_xsetbv(_regs.ecx,
> -                                                _regs.eax | (_regs.rdx << 32)),
> +            generate_exception_if(_regs.ecx > (vcpu_has_xgetbv1() ? 1 : 0),
>                                    EXC_GP, 0);

I don't think this filtering is correct.  We don't filter on the xsetbv
side, or for the plain cr/dr index.  It should be up to the hook to
decide whether a specific index is appropriate.

> +            rc = ops->read_xcr(_regs.ecx, &msr_val, ctxt);
> +            if ( rc != X86EMUL_OKAY )
> +                goto done;
> +            _regs.r(ax) = (uint32_t)msr_val;
> +            _regs.r(dx) = msr_val >> 32;
> +            break;
> +
> +        case 0xd1: /* xsetbv */
> +            generate_exception_if(vex.pfx, EXC_UD);
> +            if ( !ops->read_cr || !ops->write_xcr ||
> +                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
> +                cr4 = 0;
> +            generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
> +            generate_exception_if(!mode_ring0() || _regs.ecx, EXC_GP, 0);
> +            rc = ops->write_xcr(_regs.ecx,
> +                                _regs.eax | ((uint64_t)_regs.edx << 32), ctxt);
> +            if ( rc != X86EMUL_OKAY )
> +                goto done;
>              break;
> -#endif
>  
>          case 0xd4: /* vmfunc */
>              generate_exception_if(vex.pfx, EXC_UD);
> --- a/xen/include/asm-x86/x86-defns.h
> +++ b/xen/include/asm-x86/x86-defns.h
> @@ -66,4 +66,28 @@
>  #define X86_CR4_SMAP       0x00200000 /* enable SMAP */
>  #define X86_CR4_PKE        0x00400000 /* enable PKE */
>  
> +/*
> + * XSTATE component flags in XCR0
> + */
> +#define _XSTATE_FP                0
> +#define XSTATE_FP                 (1ULL << _XSTATE_FP)
> +#define _XSTATE_SSE               1
> +#define XSTATE_SSE                (1ULL << _XSTATE_SSE)
> +#define _XSTATE_YMM               2
> +#define XSTATE_YMM                (1ULL << _XSTATE_YMM)
> +#define _XSTATE_BNDREGS           3
> +#define XSTATE_BNDREGS            (1ULL << _XSTATE_BNDREGS)
> +#define _XSTATE_BNDCSR            4
> +#define XSTATE_BNDCSR             (1ULL << _XSTATE_BNDCSR)
> +#define _XSTATE_OPMASK            5
> +#define XSTATE_OPMASK             (1ULL << _XSTATE_OPMASK)
> +#define _XSTATE_ZMM               6
> +#define XSTATE_ZMM                (1ULL << _XSTATE_ZMM)
> +#define _XSTATE_HI_ZMM            7
> +#define XSTATE_HI_ZMM             (1ULL << _XSTATE_HI_ZMM)
> +#define _XSTATE_PKRU              9
> +#define XSTATE_PKRU               (1ULL << _XSTATE_PKRU)
> +#define _XSTATE_LWP               62
> +#define XSTATE_LWP                (1ULL << _XSTATE_LWP)

Can we name these consistently as part of moving into this file?  At the
very least an X86_ prefix, and possibly an XCR0 middle.

~Andrew


* Re: [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0
  2017-12-07 14:08 ` [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0 Jan Beulich
@ 2018-02-02 13:30   ` Andrew Cooper
  2018-02-02 16:19     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:30 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:08, Jan Beulich wrote:
> Experimentally MPX instructions have been confirmed to behave as NOPs
> unless both related XCR0 bits are set to 1. By implication branches
> then also don't clear BNDn.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -2143,12 +2143,16 @@ static bool umip_active(struct x86_emula
>  static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
>                         const struct x86_emulate_ops *ops, enum vex_pfx pfx)
>  {
> -    uint64_t bndcfg;
> +    uint64_t xcr0, bndcfg;
>      int rc;
>  
>      if ( pfx == vex_f2 || !cpu_has_mpx || !vcpu_has_mpx() )
>          return;
>  
> +    if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY ||
> +         !(xcr0 & XSTATE_BNDREGS) || !(xcr0 & XSTATE_BNDCSR) )

!(xcr0 & (XSTATE_BNDREGS | XSTATE_BNDCSR)) ?

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> +        return;
> +
>      if ( !mode_ring0() )
>          bndcfg = read_bndcfgu();
>      else if ( !ops->read_msr ||
>
>
>



* Re: [PATCH v3 14/25] x86emul: make all FPU emulation use the stub
  2017-12-07 14:09 ` [PATCH v3 14/25] x86emul: make all FPU emulation use the stub Jan Beulich
@ 2018-02-02 13:37   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:37 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:09, Jan Beulich wrote:
> While this means quite some reduction of (source) code, the main
> purpose is to no longer have exceptions raised from other than stubs.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> for the reduction
alone, but with a recommendation.

> @@ -4266,37 +4262,13 @@ x86_emulate(
>              emulate_fpu_insn_stub(0xd8, modrm);
>              break;
>          default:
> +        fpu_memsrc32:
>              ASSERT(ea.type == OP_MEM);
>              if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
>                                   4, ctxt)) != X86EMUL_OKAY )
>                  goto done;
> -            switch ( modrm_reg & 7 )
> -            {
> -            case 0: /* fadd */
> -                emulate_fpu_insn_memsrc("fadds", src.val);
> -                break;
> -            case 1: /* fmul */
> -                emulate_fpu_insn_memsrc("fmuls", src.val);
> -                break;
> -            case 2: /* fcom */
> -                emulate_fpu_insn_memsrc("fcoms", src.val);
> -                break;
> -            case 3: /* fcomp */
> -                emulate_fpu_insn_memsrc("fcomps", src.val);
> -                break;
> -            case 4: /* fsub */
> -                emulate_fpu_insn_memsrc("fsubs", src.val);
> -                break;
> -            case 5: /* fsubr */
> -                emulate_fpu_insn_memsrc("fsubrs", src.val);
> -                break;
> -            case 6: /* fdiv */
> -                emulate_fpu_insn_memsrc("fdivs", src.val);
> -                break;
> -            case 7: /* fdivr */
> -                emulate_fpu_insn_memsrc("fdivrs", src.val);
> -                break;
> -            }
> +            emulate_fpu_insn_memsrc(b, modrm_reg, src.val);

The modrm_reg & 7 should be visible here to make it obvious that the rex
prefix, if any, is dropped.  It is probably best to duplicate the &7
because the one inside the macro is for encoding safety, and the
compiler can trivially combine the two.
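
i.e. (sketch):

    emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);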

~Andrew


* Re: [PATCH v3 15/25] x86/HVM: eliminate custom #MF/#XM handling
  2017-12-07 14:10 ` [PATCH v3 15/25] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
@ 2018-02-02 13:38   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:38 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:10, Jan Beulich wrote:
> Use the generic stub exception handling instead.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>, because I'm happy
with the change but you've got quite a lot of rebasing to get this to
apply to staging.


* Re: [PATCH v3 16/25] x86emul: support SWAPGS
  2017-12-07 14:11 ` [PATCH v3 16/25] x86emul: support SWAPGS Jan Beulich
@ 2018-02-02 13:41   ` Andrew Cooper
  2018-02-02 16:24     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 13:41 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:11, Jan Beulich wrote:
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: New.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -5047,6 +5047,24 @@ x86_emulate(
>                  goto done;
>              break;
>  
> +        case 0xf8: /* swapgs */
> +            generate_exception_if(!mode_64bit(), EXC_UD);
> +            generate_exception_if(!mode_ring0(), EXC_GP, 0);
> +            fail_if(!ops->read_segment || !ops->read_msr ||
> +                    !ops->write_segment || !ops->write_msr);
> +            if ( (rc = ops->read_segment(x86_seg_gs, &sreg,
> +                                         ctxt)) != X86EMUL_OKAY ||
> +                 (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
> +                                     ctxt)) != X86EMUL_OKAY ||
> +                 (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
> +                                      ctxt)) != X86EMUL_OKAY )

We need to unwind this write in the case of write_segment failing, or
when the instruction restarts, state will be corrupt.
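
Something along these lines (sketch only; what to do if the restore
itself fails is left open):

    if ( (rc = ops->write_segment(x86_seg_gs, &sreg, ctxt)) != X86EMUL_OKAY )
    {
        /* Put the previous shadow value back before bailing. */
        ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, ctxt);
        goto done;
    }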

~Andrew

> +                goto done;
> +            sreg.base = msr_val;
> +            if ( (rc = ops->write_segment(x86_seg_gs, &sreg,
> +                                          ctxt)) != X86EMUL_OKAY )
> +                goto done;
> +            break;
> +
>          case 0xf9: /* rdtscp */
>              fail_if(ops->read_msr == NULL);
>              if ( (rc = ops->read_msr(MSR_TSC_AUX,
>
>
>



* Re: [PATCH v3 17/25] x86emul: emulate {MONITOR, MWAIT}{, X} as no-op
  2017-12-07 14:11 ` [PATCH v3 17/25] x86emul: emulate {MONITOR, MWAIT}{, X} as no-op Jan Beulich
@ 2018-02-02 14:05   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 14:05 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:11, Jan Beulich wrote:
> As mentioned in Linux commit 87c00572ba ("kvm: x86: emulate monitor and
> mwait instructions as nop"), older OS X versions (for example) may make
> use of the insns without checking CPUID flags (presumably implying
> availability from family/model).

-1 to this.

IIRC, monitor and mwait are disabled entirely due to VMCS/VMCB
configuration, and convert to #UD internally.  The emulator shouldn't be
able to let software work around that.

If and when we decide to support this functionality for guests (which
probably won't be until after EPT SPP gets in), then the feature should
use CPUID as per normal.

There is a large list of other things which prevent OS X from booting
under Xen.  If someone decides to step up and get OS X support working
then we could reconsider whether we quirk this, but until then,
unilaterally quirking it is a net negative.

~Andrew


* Re: [PATCH v3 18/25] x86emul: add missing suffixes in test harness
  2017-12-07 14:12 ` [PATCH v3 18/25] x86emul: add missing suffixes in test harness Jan Beulich
@ 2018-02-02 14:13   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 14:13 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:12, Jan Beulich wrote:
> I'm in the process of putting together a gas change issuing at least
> warnings when the intended size of a memory operation can't be deduced
> from another (register) operand. Add missing suffixes to silence such
> future diagnostics.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect
  2017-12-07 14:14 ` [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect Jan Beulich
  2017-12-08 10:58   ` Paul Durrant
@ 2018-02-02 14:13   ` Andrew Cooper
  1 sibling, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 14:13 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Paul Durrant, Tim Deegan

On 07/12/17 14:14, Jan Beulich wrote:
> This is necessary for the hook to correctly perform the operation.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures
  2017-12-07 14:15 ` [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures Jan Beulich
@ 2018-02-02 14:49   ` Andrew Cooper
  2018-02-05  8:07     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 14:49 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Tim Deegan

On 07/12/17 14:15, Jan Beulich wrote:
> If the ->cmpxchg() hook finds a mismatch, we should deal with this the
> same way as when the "manual" comparison reports a mismatch.
>
> This involves reverting bfce0e62c3 ("x86/emul: Drop
> X86EMUL_CMPXCHG_FAILED"), albeit with X86EMUL_CMPXCHG_FAILED now
> becoming a value distinct from X86EMUL_RETRY.
>
> In order to not leave mixed code also fully switch affected functions
> from paddr_t to intpte_t.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: New.
> ---
> The code could be further simplified if we could rely on all
> ->cmpxchg() hooks always using CMPXCHG, but for now we need to cope
> with them using plain writes (and hence accept the double reads if
> CMPXCHG is actually being used).
> Note that the patch doesn't address the incorrectness of there not
> being a memory write even in the comparison-failed case.
>
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -302,8 +302,12 @@ hvm_emulate_cmpxchg(enum x86_segment seg
>      memcpy(&old, p_old, bytes);
>      memcpy(&new, p_new, bytes);
>  
> -    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
> -               v, addr, old, new, bytes, sh_ctxt);
> +    rc = v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
> +             v, addr, &old, new, bytes, sh_ctxt);
> +
> +    memcpy(p_old, &old, bytes);

This is redundant with ...

> +
> +    return rc;
>  }
>  
>  static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
> --- a/xen/arch/x86/mm/shadow/multi.c
> +++ b/xen/arch/x86/mm/shadow/multi.c
> @@ -4741,11 +4741,11 @@ sh_x86_emulate_write(struct vcpu *v, uns
>  
>  static int
>  sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
> -                        unsigned long old, unsigned long new,
> -                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
> +                       unsigned long *p_old, unsigned long new,
> +                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
>  {
>      void *addr;
> -    unsigned long prev;
> +    unsigned long prev, old = *p_old;
>      int rv = X86EMUL_OKAY;
>  
>      /* Unaligned writes are only acceptable on HVM */
> @@ -4769,7 +4769,10 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
>      }
>  
>      if ( prev != old )
> -        rv = X86EMUL_RETRY;
> +    {
> +        *p_old = prev;

... this, is it not?

> +        rv = X86EMUL_CMPXCHG_FAILED;
> +    }
>  
>      SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
>                    " wanted %#lx now %#lx bytes %u\n",
> --- a/xen/arch/x86/pv/ro-page-fault.c
> +++ b/xen/arch/x86/pv/ro-page-fault.c
> @@ -65,14 +65,16 @@ static int ptwr_emulated_read(enum x86_s
>      return X86EMUL_OKAY;
>  }
>  
> -static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val,
> -                                unsigned int bytes, unsigned int do_cmpxchg,
> +static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
> +                                intpte_t val, unsigned int bytes,
>                                  struct x86_emulate_ctxt *ctxt)
>  {
>      unsigned long mfn;
>      unsigned long unaligned_addr = addr;
>      struct page_info *page;
>      l1_pgentry_t pte, ol1e, nl1e, *pl1e;
> +    intpte_t old = p_old ? *p_old : 0;
> +    unsigned int offset = 0;

I really think this conversion to intpte needs splitting out into a
separate patch.  You're making multiple changes in this function which
aren't mentioned in the commit message at all, including introducing the distinction
I've just noted of *p_old being NULL meaning a write rather than cmpxchg.

On that note specifically, it would be clearer to have "const bool
do_cmpxchg = *p_old; /* cmpxchg, or write? */".  If you don't want to do
it, then there needs to be a comment with the function explaining the
semantics of p_old.

~Andrew
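
For readers following the semantics being debated here, a minimal sketch (not
code from the series; example_cmpxchg() is a made-up name, __cmpxchg() is the
existing helper used later in the series) of the ->cmpxchg() hook contract
after this change: on a mismatch the hook reports X86EMUL_CMPXCHG_FAILED and
hands the observed value back through p_old, mirroring the "manual"
comparison path.

    static int example_cmpxchg(void *p_old, const void *p_new,
                               unsigned int bytes, void *mapping)
    {
        unsigned long old = 0, new = 0, cur;

        memcpy(&old, p_old, bytes);
        memcpy(&new, p_new, bytes);

        cur = __cmpxchg(mapping, old, new, bytes);
        if ( cur == old )
            return X86EMUL_OKAY;

        /* Tell the caller what was really found in memory. */
        memcpy(p_old, &cur, bytes);
        return X86EMUL_CMPXCHG_FAILED;
    }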



* Re: [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces
  2018-02-02 11:43   ` Andrew Cooper
@ 2018-02-02 15:15     ` Jan Beulich
  2018-02-02 16:02       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 15:15 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 12:43, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:04, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -458,6 +458,20 @@ static const opcode_desc_t xop_table[] =
>>      DstReg|SrcImm|ModRM,
>>  };
>>  
>> +static const struct {
>> +    uint8_t simd_size:5;
>> +    uint8_t two_op:1;
>> +    uint8_t four_op:1;
>> +} ext8f08_table[256] = {
>> +};
>> +
>> +static const struct {
>> +    uint8_t simd_size:5;
>> +    uint8_t two_op:1;
>> +} ext8f09_table[256] = {
>> +    [0x01 ... 0x02] = { .two_op = 1 },
>> +};
>> +
> 
> What about 8f0a ?  We've got emulation for bextr already, and might want
> to consider #GRP4 seeing as we expose LWP to guests.

I'd prefer to convert that to a table at the time we need it.

>> @@ -2726,7 +2740,7 @@ x86_decode(
>>              }
>>              break;
>>  
>> -        case vex_0f38:
>> +        case ext_0f38:
>>              d = ext0f38_table[b].to_mem ? DstMem | SrcReg
>>                                          : DstReg | SrcMem;
>>              if ( ext0f38_table[b].two_op )
>> @@ -2736,7 +2750,14 @@ x86_decode(
>>              state->simd_size = ext0f38_table[b].simd_size;
>>              break;
>>  
>> -        case vex_0f3a:
>> +        case ext_8f09:
>> +            if ( ext8f09_table[b].two_op )
>> +                d |= TwoOp;
>> +            state->simd_size = ext8f09_table[b].simd_size;
>> +            break;
>> +
>> +        case ext_0f3a:
>> +        case ext_8f08:
>>              /*
>>               * Cannot update d here yet, as the immediate operand still
>>               * needs fetching.
>> @@ -2928,6 +2949,15 @@ x86_decode(
>>          break;
>>  
>>      case ext_8f08:
>> +        d = DstReg | SrcMem;
>> +        if ( ext8f08_table[b].two_op )
>> +            d |= TwoOp;
>> +        else if ( ext8f08_table[b].four_op && !mode_64bit() )
>> +            imm1 &= 0x7f;
>> +        state->desc = d;
>> +        state->simd_size = ext8f08_table[b].simd_size;
>> +        break;
> 
> I presume that these don't actually impact our currently implemented XOP
> instructions?

No - as the description says, the patch converts the existing ones,
it doesn't break them (and the test harness continuing to work
confirms this).

Jan



* Re: [PATCH v3 09/25] x86emul: support XOP insns
  2018-02-02 12:03   ` Andrew Cooper
@ 2018-02-02 15:17     ` Jan Beulich
  2018-02-05 13:01       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 15:17 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 13:03, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:04, Jan Beulich wrote:
>> @@ -8027,6 +8060,13 @@ x86_emulate(
>>          generate_exception_if(vex.w, EXC_UD);
>>          goto simd_0f_imm8_avx;
>>  
>> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
>> +                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
>> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
>> +                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
>> +        host_and_vcpu_must_have(xop);
>> +        goto simd_0f_imm8_ymm;
> 
> Is this correct?  VEX.W selects which operand may be the memory operand,
> and I don't see anything in the decode which copes, or anything in the
> stub which adjusts .W.

That's the nice thing here - by re-using the original instruction in
the stub (with only GPR numbers adjusted if necessary) we simply
don't care which of the operands is the memory one, as long as
the access width does not differ (and it doesn't).

Jan



* Re: [PATCH v3 10/25] x86emul: support 3DNow! insns
  2018-02-02 13:02   ` Andrew Cooper
@ 2018-02-02 15:22     ` Jan Beulich
  2018-02-02 16:04       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 15:22 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 14:02, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:05, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -355,6 +355,36 @@ static const struct {
>>      [0xff] = { ModRM }
>>  };
>>  
>> +static const uint16_t _3dnow_table[16] = {
> 
> Comment explaining how these mappings work?  It looks like nibble
> splits, but I still can't work out how to crossreference with the opcode
> tables.

Will do. Array index is high opcode nibble, bit index is low opcode
nibble.

>> +    [0x0] = (1 << 0xd) /* pi2fd */,
>> +    [0x1] = (1 << 0xd) /* pf2id */,
>> +    [0x9] = (1 << 0x0) /* pfcmpge */ |
>> +            (1 << 0x4) /* pfmin */ |
>> +            (1 << 0x6) /* pfrcp */ |
>> +            (1 << 0x7) /* pfrsqrt */ |
>> +            (1 << 0xa) /* pfsub */ |
>> +            (1 << 0xe) /* pfadd */,
>> +    [0xa] = (1 << 0x0) /* pfcmpge */ |
>> +            (1 << 0x4) /* pfmax */ |
>> +            (1 << 0x6) /* pfrcpit1 */ |
>> +            (1 << 0x7) /* pfrsqit1 */ |
>> +            (1 << 0xa) /* pfsubr */ |
>> +            (1 << 0xe) /* pfacc */,
>> +    [0xb] = (1 << 0x0) /* pfcmpeq */ |
>> +            (1 << 0x4) /* pfmul */ |
>> +            (1 << 0x6) /* pfrcpit2 */ |
>> +            (1 << 0x7) /* pmulhrw */ |
>> +            (1 << 0xf) /* pavgusb */,
>> +};
>> +
>> +static const uint16_t _3dnow_ext_table[16] = {
>> +    [0x1] = (1 << 0xd) /* pi2fw */,
>> +    [0x1] = (1 << 0xc) /* pf2iw */,
> 
> You presumably want an | in here instead?

No, the first of the two lines is wrong and needs to be

    [0x0] = (1 << 0xc) /* pi2fw */,

(wrong post-copy-and-paste editing).

>> @@ -5505,6 +5537,26 @@ x86_emulate(
>>      case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
>>          break;
> 
> 0f 0d prefetches?  They are 3DNow instructions, but available on later
> processors.

And it is for that latter reason (I assume) that we have these
already.

Jan
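
As a purely illustrative sketch of the lookup described above (the helper
name is made up; only the nibble split reflects the patch): the 3DNow!
opcode suffix byte is split so that its high nibble indexes the array and
its low nibble selects a bit within the uint16_t entry.

    static bool _3dnow_opcode_implemented(uint8_t suffix)
    {
        return _3dnow_table[suffix >> 4] & (1u << (suffix & 0xf));
    }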



* Re: [PATCH v3 11/25] x86emul: place test blobs in executable section
  2018-02-02 13:03   ` Andrew Cooper
@ 2018-02-02 15:27     ` Jan Beulich
  2018-02-05 13:11       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 15:27 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 14:03, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:06, Jan Beulich wrote:
>> This allows the section contents to be disassembled without going
>> through any extra hoops, simplifying the analysis of problems in test
>> and/or emulation code.
>>
>> The blobs being emitted as (r/o) data means we need to accept an
>> assembler warning here (about the differing section attributes).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> What about just giving up their constness?  This is a test program after
> all.

Then the conflict would be in two attributes (writable and
executable) rather than just one. The issue is that we emit them
as data, but want them to be in an executable section. If anything
we'd have to re-do how they're emitted (e.g. by using asm()), but
that seems overkill to me.

Jan



* Re: [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces
  2018-02-02 15:15     ` Jan Beulich
@ 2018-02-02 16:02       ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel

On 02/02/18 15:15, Jan Beulich wrote:
>>>> On 02.02.18 at 12:43, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:04, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -458,6 +458,20 @@ static const opcode_desc_t xop_table[] =
>>>      DstReg|SrcImm|ModRM,
>>>  };
>>>  
>>> +static const struct {
>>> +    uint8_t simd_size:5;
>>> +    uint8_t two_op:1;
>>> +    uint8_t four_op:1;
>>> +} ext8f08_table[256] = {
>>> +};
>>> +
>>> +static const struct {
>>> +    uint8_t simd_size:5;
>>> +    uint8_t two_op:1;
>>> +} ext8f09_table[256] = {
>>> +    [0x01 ... 0x02] = { .two_op = 1 },
>>> +};
>>> +
>> What about 8f0a ?  We've got emulation for bextr already, and might want
>> to consider #GRP4 seeing as we expose LWP to guests.
> I'd prefer to convert that to a table at the time we need it.
>
>>> @@ -2726,7 +2740,7 @@ x86_decode(
>>>              }
>>>              break;
>>>  
>>> -        case vex_0f38:
>>> +        case ext_0f38:
>>>              d = ext0f38_table[b].to_mem ? DstMem | SrcReg
>>>                                          : DstReg | SrcMem;
>>>              if ( ext0f38_table[b].two_op )
>>> @@ -2736,7 +2750,14 @@ x86_decode(
>>>              state->simd_size = ext0f38_table[b].simd_size;
>>>              break;
>>>  
>>> -        case vex_0f3a:
>>> +        case ext_8f09:
>>> +            if ( ext8f09_table[b].two_op )
>>> +                d |= TwoOp;
>>> +            state->simd_size = ext8f09_table[b].simd_size;
>>> +            break;
>>> +
>>> +        case ext_0f3a:
>>> +        case ext_8f08:
>>>              /*
>>>               * Cannot update d here yet, as the immediate operand still
>>>               * needs fetching.
>>> @@ -2928,6 +2949,15 @@ x86_decode(
>>>          break;
>>>  
>>>      case ext_8f08:
>>> +        d = DstReg | SrcMem;
>>> +        if ( ext8f08_table[b].two_op )
>>> +            d |= TwoOp;
>>> +        else if ( ext8f08_table[b].four_op && !mode_64bit() )
>>> +            imm1 &= 0x7f;
>>> +        state->desc = d;
>>> +        state->simd_size = ext8f08_table[b].simd_size;
>>> +        break;
>> I presume that these don't actually impact our currently implemented XOP
>> instructions?
> No - as the description says, the patch converts the existing ones,
> it doesn't break them (and the test harness continuing to work
> confirms this).

Ok, in which case Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>, conditional on the tables gaining names
(which you said you've already done IIRC).

~Andrew


* Re: [PATCH v3 10/25] x86emul: support 3DNow! insns
  2018-02-02 15:22     ` Jan Beulich
@ 2018-02-02 16:04       ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:04 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel

On 02/02/18 15:22, Jan Beulich wrote:
>>>> On 02.02.18 at 14:02, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:05, Jan Beulich wrote:
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -355,6 +355,36 @@ static const struct {
>>>      [0xff] = { ModRM }
>>>  };
>>>  
>>> +static const uint16_t _3dnow_table[16] = {
>> Comment explaining how these mappings work?  It looks like nibble
>> splits, but I still can't work out how to crossreference with the opcode
>> tables.
> Will do. Array index is high opcode nibble, bit index is low opcode
> nibble.
>
>>> +    [0x0] = (1 << 0xd) /* pi2fd */,
>>> +    [0x1] = (1 << 0xd) /* pf2id */,
>>> +    [0x9] = (1 << 0x0) /* pfcmpge */ |
>>> +            (1 << 0x4) /* pfmin */ |
>>> +            (1 << 0x6) /* pfrcp */ |
>>> +            (1 << 0x7) /* pfrsqrt */ |
>>> +            (1 << 0xa) /* pfsub */ |
>>> +            (1 << 0xe) /* pfadd */,
>>> +    [0xa] = (1 << 0x0) /* pfcmpge */ |
>>> +            (1 << 0x4) /* pfmax */ |
>>> +            (1 << 0x6) /* pfrcpit1 */ |
>>> +            (1 << 0x7) /* pfrsqit1 */ |
>>> +            (1 << 0xa) /* pfsubr */ |
>>> +            (1 << 0xe) /* pfacc */,
>>> +    [0xb] = (1 << 0x0) /* pfcmpeq */ |
>>> +            (1 << 0x4) /* pfmul */ |
>>> +            (1 << 0x6) /* pfrcpit2 */ |
>>> +            (1 << 0x7) /* pmulhrw */ |
>>> +            (1 << 0xf) /* pavgusb */,
>>> +};
>>> +
>>> +static const uint16_t _3dnow_ext_table[16] = {
>>> +    [0x1] = (1 << 0xd) /* pi2fw */,
>>> +    [0x1] = (1 << 0xc) /* pf2iw */,
>> You presumably want an | in here instead?
> No, the first of the two lines is wrong and needs to be
>
>     [0x0] = (1 << 0xc) /* pi2fw */,
>
> (wrong post-copy-and-paste editing).
>
>>> @@ -5505,6 +5537,26 @@ x86_emulate(
>>>      case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
>>>          break;
>> 0f 0d prefetches?  They are 3DNow instructions, but available on later
>> processors.
> And it is for that latter reason (I assume) that we have these
> already.

Ah.  I see now that they are just out of context above this hunk.

Sorry for the noise.

~Andrew


* Re: [PATCH v3 21/25] x86emul: add read-modify-write hook
  2017-12-07 14:16 ` [PATCH v3 21/25] x86emul: add read-modify-write hook Jan Beulich
@ 2018-02-02 16:13   ` Andrew Cooper
  2018-02-05  8:22     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:13 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 07/12/17 14:16, Jan Beulich wrote:
> In order to correctly emulate read-modify-write insns, especially
> LOCKed ones, we should not issue reads and writes separately. Use a
> new hook to combine both, and don't uniformly read the memory
> destination anymore. Instead, DstMem opcodes without Mov now need to
> have done so in their respective case blocks.
>
> Also strip bogus _ prefixes from macro parameters when this only affects
> lines which are being changed anyway.
>
> In the test harness, besides some re-ordering to facilitate running a
> few tests twice (one without and a second time with the .rmw hook in
> place), tighten a few EFLAGS checks and add a test for NOT with memory
> operand (in particular to verify EFLAGS don't get altered there).
>
> For now make use of the hook optional for callers; eventually we may
> want to consider making this mandatory.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: New.
> ---
> TBD: Do we want to also support non-lockable RMW insns in the new hook
>      and helper (SHL & friends, SHLD, SHRD)?

What would this achieve?  I suppose it would avoid a double pagewalk.

>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -3356,35 +3388,83 @@ x86_emulate(
>          unsigned int i, n;
>          unsigned long dummy;
>  
> -    case 0x00 ... 0x05: add: /* add */
> -        emulate_2op_SrcV("add", src, dst, _regs.eflags);
> +    case 0x00: case 0x01: add: /* add reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_add;
> +        else
> +        {
> +    case 0x02 ... 0x05: /* add */

I think it would help to identify reg,reg specifically in these comments.

> +            emulate_2op_SrcV("add", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x08 ... 0x0d: or:  /* or */
> -        emulate_2op_SrcV("or", src, dst, _regs.eflags);
> +    case 0x08: case 0x09: or: /* or reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_or;
> +        else
> +        {
> +    case 0x0a ... 0x0d: /* or */
> +            emulate_2op_SrcV("or", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x10 ... 0x15: adc: /* adc */
> -        emulate_2op_SrcV("adc", src, dst, _regs.eflags);
> +    case 0x10: case 0x11: adc: /* adc reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_adc;
> +        else
> +        {
> +    case 0x12 ... 0x15: /* adc */
> +            emulate_2op_SrcV("adc", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x18 ... 0x1d: sbb: /* sbb */
> -        emulate_2op_SrcV("sbb", src, dst, _regs.eflags);
> +    case 0x18: case 0x19: sbb: /* sbb reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_sbb;
> +        else
> +        {
> +    case 0x1a ... 0x1d: /* sbb */
> +            emulate_2op_SrcV("sbb", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x20 ... 0x25: and: /* and */
> -        emulate_2op_SrcV("and", src, dst, _regs.eflags);
> +    case 0x20: case 0x21: and: /* and reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_and;
> +        else
> +        {
> +    case 0x22 ... 0x25: /* and */
> +            emulate_2op_SrcV("and", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x28 ... 0x2d: sub: /* sub */
> -        emulate_2op_SrcV("sub", src, dst, _regs.eflags);
> +    case 0x28: case 0x29: sub: /* sub reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_sub;
> +        else
> +        {
> +    case 0x2a ... 0x2d: /* sub */
> +            emulate_2op_SrcV("sub", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x30 ... 0x35: xor: /* xor */
> -        emulate_2op_SrcV("xor", src, dst, _regs.eflags);
> +    case 0x30: case 0x31: xor: /* xor reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM )
> +            state->rmw = rmw_xor;
> +        else
> +        {
> +    case 0x32 ... 0x35: /* xor */
> +            emulate_2op_SrcV("xor", src, dst, _regs.eflags);
> +        }
>          break;
>  
> -    case 0x38 ... 0x3d: cmp: /* cmp */
> +    case 0x38: case 0x39: cmp: /* cmp reg,mem */
> +        if ( ops->rmw && dst.type == OP_MEM &&
> +             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
> +                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )

Why does rmw matter here? cmp doesn't write to its operands.

> +            goto done;
> +        /* fall through */
> +    case 0x3a ... 0x3d: /* cmp */
>          generate_exception_if(lock_prefix, EXC_UD);
>          emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
>          dst.type = OP_NONE;
> @@ -3700,6 +3780,13 @@ x86_emulate(
>          break;
>  
>      case 0x86 ... 0x87: xchg: /* xchg */
> +        /* The lock prefix is implied for this insn. */
> +        lock_prefix = 1;

Only for memory.  I.e. this should probably be inside an OP_MEM check.

~Andrew

> +        if ( ops->rmw && dst.type == OP_MEM )
> +        {
> +            state->rmw = rmw_xchg;
> +            break;
> +        }
>          /* Write back the register source. */
>          switch ( dst.bytes )
>          {



* Re: [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0
  2018-02-02 13:30   ` Andrew Cooper
@ 2018-02-02 16:19     ` Jan Beulich
  2018-02-02 16:28       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 16:19 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 14:30, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:08, Jan Beulich wrote:
>> Experimentally MPX instructions have been confirmed to behave as NOPs
>> unless both related XCR0 bits are set to 1. By implication branches
>> then also don't clear BNDn.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -2143,12 +2143,16 @@ static bool umip_active(struct x86_emula
>>  static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
>>                         const struct x86_emulate_ops *ops, enum vex_pfx pfx)
>>  {
>> -    uint64_t bndcfg;
>> +    uint64_t xcr0, bndcfg;
>>      int rc;
>>  
>>      if ( pfx == vex_f2 || !cpu_has_mpx || !vcpu_has_mpx() )
>>          return;
>>  
>> +    if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY ||
>> +         !(xcr0 & XSTATE_BNDREGS) || !(xcr0 & XSTATE_BNDCSR) )
> 
> !(xcr0 & (XSTATE_BNDREGS | XSTATE_BNDCSR)) ?

No, I mean "if either bit is clear", not "if both bits are clear". I think
we had discussed before that both bits need to be 1 in order for
bounds checking to actually work.

> Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Please clarify this in light of the above.

Jan



* Re: [PATCH v3 16/25] x86emul: support SWAPGS
  2018-02-02 13:41   ` Andrew Cooper
@ 2018-02-02 16:24     ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 16:24 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 14:41, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:11, Jan Beulich wrote:
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v3: New.
>>
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -5047,6 +5047,24 @@ x86_emulate(
>>                  goto done;
>>              break;
>>  
>> +        case 0xf8: /* swapgs */
>> +            generate_exception_if(!mode_64bit(), EXC_UD);
>> +            generate_exception_if(!mode_ring0(), EXC_GP, 0);
>> +            fail_if(!ops->read_segment || !ops->read_msr ||
>> +                    !ops->write_segment || !ops->write_msr);
>> +            if ( (rc = ops->read_segment(x86_seg_gs, &sreg,
>> +                                         ctxt)) != X86EMUL_OKAY ||
>> +                 (rc = ops->read_msr(MSR_SHADOW_GS_BASE, &msr_val,
>> +                                     ctxt)) != X86EMUL_OKAY ||
>> +                 (rc = ops->write_msr(MSR_SHADOW_GS_BASE, sreg.base,
>> +                                      ctxt)) != X86EMUL_OKAY )
> 
> We need to unwind this write in the case of write_segment failing, or
> when the instruction restarts, state will be corrupt.

We don't do similar restoring anywhere else iirc, so I'm not sure I
want to start doing so here. Multi-element updates really need to
be converted to go through a staging layer, where the checks
are done right away, but the commit happens only at the end.
One of the reasons I decided against doing what you suggest
(indeed I had considered that) is that this other write may then
fail, too. Let me know.

Jan

>> +                goto done;
>> +            sreg.base = msr_val;
>> +            if ( (rc = ops->write_segment(x86_seg_gs, &sreg,
>> +                                          ctxt)) != X86EMUL_OKAY )
>> +                goto done;
>> +            break;
>> +
>>          case 0xf9: /* rdtscp */
>>              fail_if(ops->read_msr == NULL);
>>              if ( (rc = ops->read_msr(MSR_TSC_AUX,
>>
>>
>>





* Re: [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0
  2018-02-02 16:19     ` Jan Beulich
@ 2018-02-02 16:28       ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:28 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel

On 02/02/18 16:19, Jan Beulich wrote:
>>>> On 02.02.18 at 14:30, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:08, Jan Beulich wrote:
>>> Experimentally MPX instructions have been confirmed to behave as NOPs
>>> unless both related XCR0 bits are set to 1. By implication branches
>>> then also don't clear BNDn.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -2143,12 +2143,16 @@ static bool umip_active(struct x86_emula
>>>  static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
>>>                         const struct x86_emulate_ops *ops, enum vex_pfx pfx)
>>>  {
>>> -    uint64_t bndcfg;
>>> +    uint64_t xcr0, bndcfg;
>>>      int rc;
>>>  
>>>      if ( pfx == vex_f2 || !cpu_has_mpx || !vcpu_has_mpx() )
>>>          return;
>>>  
>>> +    if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY ||
>>> +         !(xcr0 & XSTATE_BNDREGS) || !(xcr0 & XSTATE_BNDCSR) )
>> !(xcr0 & (XSTATE_BNDREGS | XSTATE_BNDCSR)) ?
> No, I mean "if either bit is clear", not "if both bits are clear". I think
> we had discussed before that both bits need to be 1 in order for
> bounds checking to actually work.
>
>> Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Please clarify this in light of the above.

Architecturally, they can't be different, which is why the above logic
looks suspicious.

Given that the actual logic isn't wrong, I won't object, but it does look
wrong to compare them individually.

~Andrew
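
To make the point of contention concrete, the condition Jan describes
("both bits need to be 1") can equivalently be written with a combined
mask; a purely illustrative helper, using the XSTATE_* constants
introduced elsewhere in the series:

    static bool xcr0_mpx_enabled(uint64_t xcr0)
    {
        const uint64_t mask = XSTATE_BNDREGS | XSTATE_BNDCSR;

        /* Both bits must be set; a single clear bit disables MPX. */
        return (xcr0 & mask) == mask;
    }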


* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2017-12-07 14:16 ` [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg() Jan Beulich
  2017-12-07 14:38   ` Razvan Cojocaru
  2017-12-08 10:38   ` Paul Durrant
@ 2018-02-02 16:36   ` Andrew Cooper
  2018-02-05  8:32     ` Jan Beulich
  2 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:36 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Paul Durrant

On 07/12/17 14:16, Jan Beulich wrote:
> ..., at least as far as currently possible, i.e. when a mapping can be
> obtained.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: New.
>
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1296,8 +1296,83 @@ static int hvmemul_cmpxchg(
>      bool lock,
>      struct x86_emulate_ctxt *ctxt)
>  {
> -    /* Fix this in case the guest is really relying on r-m-w atomicity. */
> -    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> +    struct vcpu *curr = current;
> +    unsigned long addr, reps = 1;
> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;

I'm fairly certain from my pagetable work that passing PFEC_page_present
here is bogus, and I do have (eventual) plans to make the pagewalk
reject such values.

> +    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
> +    int rc;
> +    void *mapping = NULL;
> +
> +    rc = hvmemul_virtual_to_linear(
> +        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
> +    if ( rc != X86EMUL_OKAY )
> +        return rc;
> +
> +    if ( is_x86_system_segment(seg) )
> +        pfec |= PFEC_implicit;
> +    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +        pfec |= PFEC_user_mode;
> +
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    if ( IS_ERR(mapping) )
> +        return ~PTR_ERR(mapping);
> +
> +    if ( !mapping )
> +    {
> +        /* Fix this in case the guest is really relying on r-m-w atomicity. */
> +        return hvmemul_linear_mmio_write(addr, bytes, p_new, pfec,
> +                                         hvmemul_ctxt,
> +                                         vio->mmio_access.write_access &&
> +                                         vio->mmio_gla == (addr & PAGE_MASK));
> +    }
> +
> +    switch ( bytes )
> +    {
> +    case 1: case 2: case 4: case 8:
> +    {
> +        unsigned long old = 0, new = 0, cur;
> +
> +        memcpy(&old, p_old, bytes);
> +        memcpy(&new, p_new, bytes);
> +        if ( lock )
> +            cur = __cmpxchg(mapping, old, new, bytes);
> +        else
> +            cur = cmpxchg_local_(mapping, old, new, bytes);
> +        if ( cur != old )
> +        {
> +            memcpy(p_old, &cur, bytes);
> +            rc = X86EMUL_CMPXCHG_FAILED;
> +        }
> +        break;
> +    }
> +
> +    case 16:
> +        if ( cpu_has_cx16 )
> +        {
> +            __uint128_t *old = p_old, cur;
> +
> +            if ( lock )
> +                cur = __cmpxchg16b(mapping, old, p_new);
> +            else
> +                cur = cmpxchg16b_local_(mapping, old, p_new);
> +            if ( cur != *old )
> +            {
> +                *old = cur;
> +                rc = X86EMUL_CMPXCHG_FAILED;
> +            }
> +            break;
> +        }
> +        /* fall through */
> +    default:

ASSERT_UNREACHABLE() ?

> +        rc = X86EMUL_UNHANDLEABLE;
> +        break;
> +    }
> +
> +    hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
> +
> +    return rc;
>  }
>  
>  static int hvmemul_validate(
> --- a/xen/include/asm-x86/system.h
> +++ b/xen/include/asm-x86/system.h
> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>      return old;
>  }
>  
> +static always_inline unsigned long cmpxchg_local_(

unlocked_cmpxchg() ?

~Andrew
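
For reference, a simplified sketch of what distinguishes the unlocked
helper under discussion from the existing locked one (4-byte case only,
hypothetical name, not the actual implementation): the sole difference is
the absence of the LOCK prefix.

    static inline uint32_t example_cmpxchg_local(uint32_t *ptr, uint32_t old,
                                                 uint32_t new)
    {
        /* Same as the locked form, minus the "lock" prefix. */
        asm volatile ( "cmpxchgl %2, %1"
                       : "+a" (old), "+m" (*ptr)
                       : "r" (new)
                       : "memory" );
        return old;
    }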


* Re: [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook
  2017-12-07 14:17 ` [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook Jan Beulich
  2017-12-08 10:41   ` Paul Durrant
@ 2018-02-02 16:37   ` Andrew Cooper
  2018-02-05  8:34     ` Jan Beulich
  1 sibling, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:37 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Paul Durrant

On 07/12/17 14:17, Jan Beulich wrote:
> ..., at least as far as currently possible, i.e. when a mapping can be
> obtained.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: New.
>
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1187,6 +1187,61 @@ static int hvmemul_write(
>      return X86EMUL_OKAY;
>  }
>  
> +static int hvmemul_rmw(
> +    enum x86_segment seg,
> +    unsigned long offset,
> +    unsigned int bytes,
> +    uint32_t *eflags,
> +    struct x86_emulate_state *state,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> +    unsigned long addr, reps = 1;
> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;

Drop present, and...

> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
> +    int rc;
> +    void *mapping;
> +
> +    rc = hvmemul_virtual_to_linear(
> +        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
> +    if ( rc != X86EMUL_OKAY || !bytes )
> +        return rc;
> +
> +    if ( is_x86_system_segment(seg) )
> +        pfec |= PFEC_implicit;
> +    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +        pfec |= PFEC_user_mode;
> +
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    if ( IS_ERR(mapping) )
> +        return ~PTR_ERR(mapping);
> +
> +    if ( mapping )
> +    {
> +        rc = x86_emul_rmw(mapping, bytes, eflags, state, ctxt);
> +        hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
> +    }
> +    else
> +    {
> +        unsigned long data = 0;
> +        bool_t known_gpfn = vio->mmio_access.write_access &&
> +                            vio->mmio_gla == (addr & PAGE_MASK);

... bool here.

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> +
> +        if ( bytes > sizeof(data) )
> +            return X86EMUL_UNHANDLEABLE;
> +        rc = hvmemul_linear_mmio_read(addr, bytes, &data, pfec, hvmemul_ctxt,
> +                                      known_gpfn);
> +        if ( rc == X86EMUL_OKAY )
> +            rc = x86_emul_rmw(&data, bytes, eflags, state, ctxt);
> +        if ( rc == X86EMUL_OKAY )
> +            rc = hvmemul_linear_mmio_write(addr, bytes, &data, pfec,
> +                                           hvmemul_ctxt, known_gpfn);
> +    }
> +
> +    return rc;
> +}
> +
>  static int hvmemul_write_discard(
>      enum x86_segment seg,
>      unsigned long offset,
> @@ -2157,6 +2212,7 @@ static const struct x86_emulate_ops hvm_
>      .read          = hvmemul_read,
>      .insn_fetch    = hvmemul_insn_fetch,
>      .write         = hvmemul_write,
> +    .rmw           = hvmemul_rmw,
>      .cmpxchg       = hvmemul_cmpxchg,
>      .validate      = hvmemul_validate,
>      .rep_ins       = hvmemul_rep_ins,
>
>
>



* Re: [PATCH v3 24/25] x86/shadow: fully move unmap-dest into common code
  2017-12-07 14:18 ` [PATCH v3 24/25] x86/shadow: fully move unmap-dest into common code Jan Beulich
@ 2018-02-02 16:46   ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:46 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Tim Deegan

On 07/12/17 14:18, Jan Beulich wrote:
> @@ -1778,6 +1781,42 @@ void *sh_emulate_map_dest(struct vcpu *v
>      return map;
>  }
>  
> +/**************************************************************************/
> +/* Optimization: If we see two emulated writes of zeros to the same
> + * page-table without another kind of page fault in between, we guess
> + * that this is a batch of changes (for process destruction) and
> + * unshadow the page so we don't take a pagefault on every entry.  This
> + * should also make finding writeable mappings of pagetables much
> + * easier. */
> +
> +/* Look to see if this is the second emulated write in a row to this
> + * page, and unshadow if it is */

Do you mind adjusting the comment style as part of the movement?

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers
  2017-12-07 14:19 ` [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers Jan Beulich
@ 2018-02-02 16:52   ` Andrew Cooper
  2018-02-05  8:42     ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-02 16:52 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Tim Deegan

On 07/12/17 14:19, Jan Beulich wrote:
> @@ -298,14 +332,43 @@ hvm_emulate_cmpxchg(enum x86_segment seg
>      if ( rc )
>          return rc;
>  
> +    /* Unaligned writes are only acceptable on HVM */
> +    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
> +        return X86EMUL_UNHANDLEABLE;
> +
> +    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
> +    if ( IS_ERR(ptr) )
> +        return ~PTR_ERR(ptr);
> +
>      old = new = 0;
>      memcpy(&old, p_old, bytes);
>      memcpy(&new, p_new, bytes);
>  
> -    rc = v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
> -             v, addr, &old, new, bytes, sh_ctxt);
> +    paging_lock(v->domain);
> +    switch ( bytes )
> +    {
> +    case 1: prev = cmpxchg((uint8_t  *)ptr, old, new); break;
> +    case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break;
> +    case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break;
> +    case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break;
> +    default:
> +        SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes);

Given the earlier patches in the series, is it worth introducing case 16
here?

Irrespective, this doesn't interfere with the purpose of the patch, so
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 12/25] x86emul: abstract out XCRn accesses
  2018-02-02 13:29   ` Andrew Cooper
@ 2018-02-02 17:05     ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-02 17:05 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 14:29, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:07, Jan Beulich wrote:
>> --- a/tools/tests/x86_emulator/x86-emulate.c
>> +++ b/tools/tests/x86_emulator/x86-emulate.c
>> @@ -120,6 +120,19 @@ int emul_test_read_cr(
>>      return X86EMUL_UNHANDLEABLE;
>>  }
>>  
>> +int emul_test_read_xcr(
>> +    unsigned int reg,
>> +    uint64_t *val,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    uint32_t lo, hi;
>> +
>> +    asm ( "xgetbv" : "=a" (lo), "=d" (hi) : "c" (reg) );
>> +    *val = lo | ((uint64_t)hi << 32);
> 
> This will want a reg filter, or AFL will find that trying to read reg 2
> will explode.

How would AFL manage to do that? It doesn't fuzz the function
alone, and there's no call path leading here that would pass an
invalid value. It is the main emulator that should never call this
with a wrong value, or if it does, we should be happy for it to
be flagged by AFL rather than going silent (via some error path).

Plus - if I wanted to add proper checking here, I'd have to re-do
exactly what the emulator code around the call site does.

>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -1825,6 +1825,49 @@ static int hvmemul_write_cr(
>>      return rc;
>>  }
>>  
>> +static int hvmemul_read_xcr(
>> +    unsigned int reg,
>> +    uint64_t *val,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    uint32_t lo, hi;
>> +
>> +    switch ( reg )
>> +    {
>> +    case 0:
>> +        *val = current->arch.xcr0;
>> +        return X86EMUL_OKAY;
>> +
>> +    case 1:
>> +        if ( !cpu_has_xgetbv1 )
>> +            return X86EMUL_UNHANDLEABLE;
>> +        break;
>> +
>> +    default:
>> +        return X86EMUL_UNHANDLEABLE;
>> +    }
>> +
>> +    asm ( ".byte 0x0f,0x01,0xd0" /* xgetbv */
>> +          : "=a" (lo), "=d" (hi) : "c" (reg) );
> 
> Please can we have a static inline?

Sure.

>  It needs to be volatile, because
> the result depends on unspecified other operations, which for xgetbv1
> includes any instruction which alters xsave state.

Well, yes, strictly speaking it should be volatile. Will add.

> Furthermore, does this actually return the correct result?  I'd prefer
> if we didn't have to read from hardware here, but I can't see an
> alternative.

In what way do you see it possibly producing the wrong value?

> From the guest's point of view, we should at least have the guest's xcr0
> in context, but we have xcr0_accum loaded, meaning that the guest is
> liable to see returned set bits which are higher than its idea of xcr0.

Nor would it make much sense to cache a dozen or more XCRs,
once there'll be that many.

>> +static int hvmemul_write_xcr(
>> +    unsigned int reg,
>> +    uint64_t val,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    HVMTRACE_LONG_2D(XCR_WRITE, reg, TRC_PAR_LONG(val));
>> +    if ( likely(handle_xsetbv(reg, val) == 0) )
>> +        return X86EMUL_OKAY;
>> +
>> +    x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
>> +    return X86EMUL_EXCEPTION;
> 
> This exception is inconsistent with unhandleable above.  FTR, I'd expect
> all of them to be exception rather than unhandleable.

I can switch to that, sure.

>> @@ -5161,18 +5182,33 @@ x86_emulate(
>>                  _regs.eflags |= X86_EFLAGS_AC;
>>              break;
>>  
>> -#ifdef __XEN__
>> -        case 0xd1: /* xsetbv */
>> +        case 0xd0: /* xgetbv */
>>              generate_exception_if(vex.pfx, EXC_UD);
>> -            if ( !ops->read_cr || ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
>> +            if ( !ops->read_cr || !ops->read_xcr ||
>> +                 ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
>>                  cr4 = 0;
>>              generate_exception_if(!(cr4 & X86_CR4_OSXSAVE), EXC_UD);
>> -            generate_exception_if(!mode_ring0() ||
>> -                                  handle_xsetbv(_regs.ecx,
>> -                                                _regs.eax | (_regs.rdx << 32)),
>> +            generate_exception_if(_regs.ecx > (vcpu_has_xgetbv1() ? 1 : 0),
>>                                    EXC_GP, 0);
> 
> I don't think this filtering is correct.  We don't filter on the xsetbv
> side, or for the plain cr/dr index.  It should be up to the hook to
> decide whether a specific index is appropriate.

Any filtering that can be done here should be done here - this
is the central place to enforce architectural dependencies. I'd
rather add a similar check to xsetbv; in fact I'm not sure why I
didn't.

>> --- a/xen/include/asm-x86/x86-defns.h
>> +++ b/xen/include/asm-x86/x86-defns.h
>> @@ -66,4 +66,28 @@
>>  #define X86_CR4_SMAP       0x00200000 /* enable SMAP */
>>  #define X86_CR4_PKE        0x00400000 /* enable PKE */
>>  
>> +/*
>> + * XSTATE component flags in XCR0
>> + */
>> +#define _XSTATE_FP                0
>> +#define XSTATE_FP                 (1ULL << _XSTATE_FP)
>> +#define _XSTATE_SSE               1
>> +#define XSTATE_SSE                (1ULL << _XSTATE_SSE)
>> +#define _XSTATE_YMM               2
>> +#define XSTATE_YMM                (1ULL << _XSTATE_YMM)
>> +#define _XSTATE_BNDREGS           3
>> +#define XSTATE_BNDREGS            (1ULL << _XSTATE_BNDREGS)
>> +#define _XSTATE_BNDCSR            4
>> +#define XSTATE_BNDCSR             (1ULL << _XSTATE_BNDCSR)
>> +#define _XSTATE_OPMASK            5
>> +#define XSTATE_OPMASK             (1ULL << _XSTATE_OPMASK)
>> +#define _XSTATE_ZMM               6
>> +#define XSTATE_ZMM                (1ULL << _XSTATE_ZMM)
>> +#define _XSTATE_HI_ZMM            7
>> +#define XSTATE_HI_ZMM             (1ULL << _XSTATE_HI_ZMM)
>> +#define _XSTATE_PKRU              9
>> +#define XSTATE_PKRU               (1ULL << _XSTATE_PKRU)
>> +#define _XSTATE_LWP               62
>> +#define XSTATE_LWP                (1ULL << _XSTATE_LWP)
> 
> Can we name these consistently as part of moving into this file?  At the
> very least an X86_ prefix, and possibly an XCR0 middle.

Well, yes, if you insist I can add another patch doing this - doing
it here would be insane, as I'll need to update all users. As to
"XCR0 middle" do you mean in place of XSTATE, or in addition to
(I'd prefer the former)?

Jan
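
For illustration, the static inline asked for above might look like the
following (a sketch only, with "volatile" added since the XGETBV result
can change behind the compiler's back, e.g. via any instruction altering
XSAVE state):

    static inline uint64_t xgetbv(unsigned int xcr)
    {
        uint32_t lo, hi;

        asm volatile ( ".byte 0x0f,0x01,0xd0" /* xgetbv */
                       : "=a" (lo), "=d" (hi) : "c" (xcr) );

        return lo | ((uint64_t)hi << 32);
    }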


* Re: [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures
  2018-02-02 14:49   ` Andrew Cooper
@ 2018-02-05  8:07     ` Jan Beulich
  2018-02-05 13:38       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-05  8:07 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Tim Deegan

>>> On 02.02.18 at 15:49, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:15, Jan Beulich wrote:
>> --- a/xen/arch/x86/mm/shadow/common.c
>> +++ b/xen/arch/x86/mm/shadow/common.c
>> @@ -302,8 +302,12 @@ hvm_emulate_cmpxchg(enum x86_segment seg
>>      memcpy(&old, p_old, bytes);
>>      memcpy(&new, p_new, bytes);
>>  
>> -    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
>> -               v, addr, old, new, bytes, sh_ctxt);
>> +    rc = v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
>> +             v, addr, &old, new, bytes, sh_ctxt);

For the sake of the response below, please note the passing of
&old (rather than p_old) here.

>> +
>> +    memcpy(p_old, &old, bytes);
> 
> This is redundant with ...
> 
>> +
>> +    return rc;
>>  }
>>  
>>  static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
>> --- a/xen/arch/x86/mm/shadow/multi.c
>> +++ b/xen/arch/x86/mm/shadow/multi.c
>> @@ -4741,11 +4741,11 @@ sh_x86_emulate_write(struct vcpu *v, uns
>>  
>>  static int
>>  sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
>> -                        unsigned long old, unsigned long new,
>> -                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
>> +                       unsigned long *p_old, unsigned long new,
>> +                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
>>  {
>>      void *addr;
>> -    unsigned long prev;
>> +    unsigned long prev, old = *p_old;
>>      int rv = X86EMUL_OKAY;
>>  
>>      /* Unaligned writes are only acceptable on HVM */
>> @@ -4769,7 +4769,10 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
>>      }
>>  
>>      if ( prev != old )
>> -        rv = X86EMUL_RETRY;
>> +    {
>> +        *p_old = prev;
> 
> ... this, is it not?

No, here we copy into hvm_emulate_cmpxchg()'s local variable,
while there we copy into its caller's one. But anyway, the double
copying gets eliminated by patch 25.

>> --- a/xen/arch/x86/pv/ro-page-fault.c
>> +++ b/xen/arch/x86/pv/ro-page-fault.c
>> @@ -65,14 +65,16 @@ static int ptwr_emulated_read(enum x86_s
>>      return X86EMUL_OKAY;
>>  }
>>  
>> -static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val,
>> -                                unsigned int bytes, unsigned int do_cmpxchg,
>> +static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
>> +                                intpte_t val, unsigned int bytes,
>>                                  struct x86_emulate_ctxt *ctxt)
>>  {
>>      unsigned long mfn;
>>      unsigned long unaligned_addr = addr;
>>      struct page_info *page;
>>      l1_pgentry_t pte, ol1e, nl1e, *pl1e;
>> +    intpte_t old = p_old ? *p_old : 0;
>> +    unsigned int offset = 0;
> 
> I really think this conversion to intpte needs splitting out into a
> separate patch.  You're making multiple changes in this function which
> aren't at commit message at all, including introducing the distinction
> I've just noted of *p_old being NULL meaning a write rather than cmpxchg.

I can split out the type change, but you realize this means touching
twice some of the exact same code? As to changes not mentioned
in the commit message - I'm having trouble spotting those (the type
change is mentioned).

> On that note specifically, it would be clearer to have "const bool
> do_cmpxchg = *p_old; /* cmpxchg, or write? */".  If you don't want to do
> it, then there needs to be a comment with the function explaining the
> semantics of p_old.

I'll add a comment, even if I have a hard time finding a good place
to put it. Ahead of the function isn't really a good place imo, but I
also can't come up with anything better.

Jan
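
One possible shape for the comment offered above (illustrative wording
only, shown against the prototype quoted earlier in this thread):

    /*
     * p_old == NULL: write @val to @addr.
     * p_old != NULL: CMPXCHG @addr from *@p_old to @val; on a comparison
     *                failure, *@p_old is updated with the value found.
     */
    static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
                                    intpte_t val, unsigned int bytes,
                                    struct x86_emulate_ctxt *ctxt);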



* Re: [PATCH v3 21/25] x86emul: add read-modify-write hook
  2018-02-02 16:13   ` Andrew Cooper
@ 2018-02-05  8:22     ` Jan Beulich
  2018-02-05 14:21       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-05  8:22 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 02.02.18 at 17:13, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:16, Jan Beulich wrote:
>> In order to correctly emulate read-modify-write insns, especially
>> LOCKed ones, we should not issue reads and writes separately. Use a
>> new hook to combine both, and don't uniformly read the memory
>> destination anymore. Instead, DstMem opcodes without Mov now need to
>> have done so in their respective case blocks.
>>
>> Also strip bogus _ prefixes from macro parameters when this only affects
>> lines which are being changed anyway.
>>
>> In the test harness, besides some re-ordering to facilitate running a
>> few tests twice (one without and a second time with the .rmw hook in
>> place), tighten a few EFLAGS checks and add a test for NOT with memory
>> operand (in particular to verify EFLAGS don't get altered there).
>>
>> For now make use of the hook optional for callers; eventually we may
>> want to consider making this mandatory.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v3: New.
>> ---
>> TBD: Do we want to also support non-lockable RMW insns in the new hook
>>      and helper (SHL & friends, SHLD, SHRD)?
> 
> What would this achieve?  I suppose it would avoid a double pagewalk.

Well - it's mostly the implication from this double walk which I'm
concerned about: The first walk is one for a read, which might
succeed when the second (write) walk fails. We'd then have done
a read which should have never occurred. But anyway this would
be follow-up work only, nothing to be added to the patch here.

>> -    case 0x38 ... 0x3d: cmp: /* cmp */
>> +    case 0x38: case 0x39: cmp: /* cmp reg,mem */
>> +        if ( ops->rmw && dst.type == OP_MEM &&
>> +             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
>> +                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
> 
> Why does rmw matter here? cmp doesn't write to its operands.

The read of the "destination" operand was skipped in case there
is a ->rmw hook (see the change to generic destination operand
processing ahead of the main switch()). This needs to be carried
out here now (and elsewhere when what is nominally the
destination operand really is a second source one).

>> @@ -3700,6 +3780,13 @@ x86_emulate(
>>          break;
>>  
>>      case 0x86 ... 0x87: xchg: /* xchg */
>> +        /* The lock prefix is implied for this insn. */
>> +        lock_prefix = 1;
> 
> Only for memory.  I.e. this should probably be inside an OP_MEM check.

Consuming code (further down) ignores this for the register-only
case. Only the body of "if ( state->rmw )" and a "case OP_MEM:"
look at this flag, and ...

>> +        if ( ops->rmw && dst.type == OP_MEM )
>> +        {
>> +            state->rmw = rmw_xchg;
>> +            break;
>> +        }

... state->rmw is set here with exactly the condition around it
that you ask for. I would certainly agree if it was the decode stage
where the flag is set, but we're in the execution phase already.

Jan



* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2018-02-02 16:36   ` Andrew Cooper
@ 2018-02-05  8:32     ` Jan Beulich
  2018-02-05 16:09       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-05  8:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Paul Durrant

>>> On 02.02.18 at 17:36, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:16, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -1296,8 +1296,83 @@ static int hvmemul_cmpxchg(
>>      bool lock,
>>      struct x86_emulate_ctxt *ctxt)
>>  {
>> -    /* Fix this in case the guest is really relying on r-m-w atomicity. */
>> -    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
>> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
>> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>> +    struct vcpu *curr = current;
>> +    unsigned long addr, reps = 1;
>> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;
> 
> I'm fairly certain from my pagetable work that passing PFEC_page_present
> here is bogus, and I do have (eventual) plans to make the pagewalk
> reject such values.

Both here and in the subsequent RMW patch I'm simply following
what hvmemul_write() does. I'd prefer all three to stay in sync.

>> +    case 16:
>> +        if ( cpu_has_cx16 )
>> +        {
>> +            __uint128_t *old = p_old, cur;
>> +
>> +            if ( lock )
>> +                cur = __cmpxchg16b(mapping, old, p_new);
>> +            else
>> +                cur = cmpxchg16b_local_(mapping, old, p_new);
>> +            if ( cur != *old )
>> +            {
>> +                *old = cur;
>> +                rc = X86EMUL_CMPXCHG_FAILED;
>> +            }
>> +            break;
>> +        }
>> +        /* fall through */
>> +    default:
> 
> ASSERT_UNREACHABLE() ?
> 
>> +        rc = X86EMUL_UNHANDLEABLE;
>> +        break;
>> +    }

I'm not sure - from an abstract POV cpu_has_cx16 and the guest
seeing the feature available in its CPUID policy could differ. Granted
we're unlikely to want to try to emulate CMPXCHG16B without having
the instruction available ourselves, but it still wouldn't seem entirely
correct to assert here. I could remove the fall-through and _then_
assert in the default case only. Let me know.

>> --- a/xen/include/asm-x86/system.h
>> +++ b/xen/include/asm-x86/system.h
>> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>>      return old;
>>  }
>>  
>> +static always_inline unsigned long cmpxchg_local_(
> 
> unlocked_cmpxchg() ?

Not in line with our current naming scheme.

Jan



* Re: [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook
  2018-02-02 16:37   ` Andrew Cooper
@ 2018-02-05  8:34     ` Jan Beulich
  2018-02-05 16:15       ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-05  8:34 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Paul Durrant

>>> On 02.02.18 at 17:37, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:17, Jan Beulich wrote:
>> ..., at least as far as currently possible, i.e. when a mapping can be
>> obtained.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v3: New.
>>
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -1187,6 +1187,61 @@ static int hvmemul_write(
>>      return X86EMUL_OKAY;
>>  }
>>  
>> +static int hvmemul_rmw(
>> +    enum x86_segment seg,
>> +    unsigned long offset,
>> +    unsigned int bytes,
>> +    uint32_t *eflags,
>> +    struct x86_emulate_state *state,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
>> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>> +    unsigned long addr, reps = 1;
>> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;
> 
> Drop present, and...

See reply to previous patch.

>> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
>> +    int rc;
>> +    void *mapping;
>> +
>> +    rc = hvmemul_virtual_to_linear(
>> +        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
>> +    if ( rc != X86EMUL_OKAY || !bytes )
>> +        return rc;
>> +
>> +    if ( is_x86_system_segment(seg) )
>> +        pfec |= PFEC_implicit;
>> +    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>> +        pfec |= PFEC_user_mode;
>> +
>> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
>> +    if ( IS_ERR(mapping) )
>> +        return ~PTR_ERR(mapping);
>> +
>> +    if ( mapping )
>> +    {
>> +        rc = x86_emul_rmw(mapping, bytes, eflags, state, ctxt);
>> +        hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
>> +    }
>> +    else
>> +    {
>> +        unsigned long data = 0;
>> +        bool_t known_gpfn = vio->mmio_access.write_access &&
>> +                            vio->mmio_gla == (addr & PAGE_MASK);
> 
> ... bool here.

Oops.

> Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, but due to the first of the two requests you'll need to let
me know whether this applies with just the second change done.

Jan



* Re: [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers
  2018-02-02 16:52   ` Andrew Cooper
@ 2018-02-05  8:42     ` Jan Beulich
  2018-02-05 12:16       ` Tim Deegan
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-05  8:42 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Tim Deegan

>>> On 02.02.18 at 17:52, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 14:19, Jan Beulich wrote:
>> @@ -298,14 +332,43 @@ hvm_emulate_cmpxchg(enum x86_segment seg
>>      if ( rc )
>>          return rc;
>>  
>> +    /* Unaligned writes are only acceptable on HVM */
>> +    if ( (addr & (bytes - 1)) && !is_hvm_vcpu(v)  )
>> +        return X86EMUL_UNHANDLEABLE;
>> +
>> +    ptr = sh_emulate_map_dest(v, addr, bytes, sh_ctxt);
>> +    if ( IS_ERR(ptr) )
>> +        return ~PTR_ERR(ptr);
>> +
>>      old = new = 0;
>>      memcpy(&old, p_old, bytes);
>>      memcpy(&new, p_new, bytes);
>>  
>> -    rc = v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
>> -             v, addr, &old, new, bytes, sh_ctxt);
>> +    paging_lock(v->domain);
>> +    switch ( bytes )
>> +    {
>> +    case 1: prev = cmpxchg((uint8_t  *)ptr, old, new); break;
>> +    case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break;
>> +    case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break;
>> +    case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break;
>> +    default:
>> +        SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes);
> 
> Given the earlier patches in the series, is it worth introducing case 16
> here?

In a follow-up patch this could be an option (unless Tim knows a
reason why this might be a bad idea), but I certainly wouldn't want
to do so here.

> Irrespective, this doesn't interfere with the purpose of the patch, so
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, Jan



* Re: [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers
  2018-02-05  8:42     ` Jan Beulich
@ 2018-02-05 12:16       ` Tim Deegan
  0 siblings, 0 replies; 85+ messages in thread
From: Tim Deegan @ 2018-02-05 12:16 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, Andrew Cooper, xen-devel

Hi,

At 01:42 -0700 on 05 Feb (1517794959), Jan Beulich wrote:
> >>> On 02.02.18 at 17:52, <andrew.cooper3@citrix.com> wrote:
> > On 07/12/17 14:19, Jan Beulich wrote:
> >> +    case 1: prev = cmpxchg((uint8_t  *)ptr, old, new); break;
> >> +    case 2: prev = cmpxchg((uint16_t *)ptr, old, new); break;
> >> +    case 4: prev = cmpxchg((uint32_t *)ptr, old, new); break;
> >> +    case 8: prev = cmpxchg((uint64_t *)ptr, old, new); break;
> >> +    default:
> >> +        SHADOW_PRINTK("cmpxchg size %u is not supported\n", bytes);
> > 
> > Given the earlier patches in the series, is it worth introducing case 16
> > here?
> 
> In a follow-up patch this could be an option (unless Tim knows a
> reason why this might be a bad idea), but I certainly wouldn't want
> to do so here.

I agree that adding the 16 case shouldn't happen in this patch, and I
don't see a need for it.  Unless we think guest OSes will use 16-byte
atomic ops to update their 8-byte PTEs, the shadow code is probably
better off taking that as a hint to unshadow.

Cheers,

Tim.


* Re: [PATCH v3 09/25] x86emul: support XOP insns
  2018-02-02 15:17     ` Jan Beulich
@ 2018-02-05 13:01       ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 13:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel

On 02/02/18 15:17, Jan Beulich wrote:
>>>> On 02.02.18 at 13:03, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:04, Jan Beulich wrote:
>>> @@ -8027,6 +8060,13 @@ x86_emulate(
>>>          generate_exception_if(vex.w, EXC_UD);
>>>          goto simd_0f_imm8_avx;
>>>  
>>> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
>>> +                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
>>> +    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
>>> +                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
>>> +        host_and_vcpu_must_have(xop);
>>> +        goto simd_0f_imm8_ymm;
>> Is this correct?  VEX.W selects which operand may be the memory operand,
>> and I don't see anything in the decode which copes, or anything in the
>> stub which adjusts .W.
> That's the nice thing here - by re-using the original instruction in
> the stub (with only GPR numbers adjusted if necessary) we simply
> don't care which of the operands is the memory one, as long as
> the access width does not differ (and it doesn't).

Hmm - that is very subtle, but ok.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> as I didn't find any
other issues with the patch.

~Andrew


* Re: [PATCH v3 11/25] x86emul: place test blobs in executable section
  2018-02-02 15:27     ` Jan Beulich
@ 2018-02-05 13:11       ` Andrew Cooper
  2018-02-05 13:55         ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 13:11 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel

On 02/02/18 15:27, Jan Beulich wrote:
>>>> On 02.02.18 at 14:03, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:06, Jan Beulich wrote:
>>> This allows the section contents to be disassembled without going
>>> through any extra hoops, simplifying the analysis of problems in test
>>> and/or emulation code.
>>>
>>> The blobs being emitted as (r/o) data means we need to accept an
>>> assembler warning here (about the differing section attributes).
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> What about just giving up their constness?  This is a test program after
>> all.
> Then the conflict would be in two attributes (writable and
> executable) rather than just one. The issue is that we emit them
> as data, but want them to be in an executable section. If anything
> we'd have to re-do how they're emitted (e.g. by using asm()), but
> that seems overkill to me.

Ok.  Instead, how about having a second .test.const? wouldn't that
resolve the warnings, but still leave the instructions in an executable
section?

Either way, this is just a developer utility, so Acked-by: Andrew Cooper
<andrew.cooper3@citrix.com>


* Re: [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures
  2018-02-05  8:07     ` Jan Beulich
@ 2018-02-05 13:38       ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 13:38 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Tim Deegan

On 05/02/18 08:07, Jan Beulich wrote:
>
>>> +
>>> +    memcpy(p_old, &old, bytes);
>> This is redundant with ...
>>
>>> +
>>> +    return rc;
>>>  }
>>>  
>>>  static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
>>> --- a/xen/arch/x86/mm/shadow/multi.c
>>> +++ b/xen/arch/x86/mm/shadow/multi.c
>>> @@ -4741,11 +4741,11 @@ sh_x86_emulate_write(struct vcpu *v, uns
>>>  
>>>  static int
>>>  sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
>>> -                        unsigned long old, unsigned long new,
>>> -                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
>>> +                       unsigned long *p_old, unsigned long new,
>>> +                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
>>>  {
>>>      void *addr;
>>> -    unsigned long prev;
>>> +    unsigned long prev, old = *p_old;
>>>      int rv = X86EMUL_OKAY;
>>>  
>>>      /* Unaligned writes are only acceptable on HVM */
>>> @@ -4769,7 +4769,10 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, u
>>>      }
>>>  
>>>      if ( prev != old )
>>> -        rv = X86EMUL_RETRY;
>>> +    {
>>> +        *p_old = prev;
>> ... this, is it not?
> No, here we copy into hvm_emulate_cmpxchg()'s local variable,
> while there we copy into its caller's one. But anyway, the double
> copying gets eliminated by patch 25.

Ok.

>
>>> --- a/xen/arch/x86/pv/ro-page-fault.c
>>> +++ b/xen/arch/x86/pv/ro-page-fault.c
>>> @@ -65,14 +65,16 @@ static int ptwr_emulated_read(enum x86_s
>>>      return X86EMUL_OKAY;
>>>  }
>>>  
>>> -static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val,
>>> -                                unsigned int bytes, unsigned int do_cmpxchg,
>>> +static int ptwr_emulated_update(unsigned long addr, intpte_t *p_old,
>>> +                                intpte_t val, unsigned int bytes,
>>>                                  struct x86_emulate_ctxt *ctxt)
>>>  {
>>>      unsigned long mfn;
>>>      unsigned long unaligned_addr = addr;
>>>      struct page_info *page;
>>>      l1_pgentry_t pte, ol1e, nl1e, *pl1e;
>>> +    intpte_t old = p_old ? *p_old : 0;
>>> +    unsigned int offset = 0;
>> I really think this conversion to intpte needs splitting out into a
>> separate patch.  You're making multiple changes in this function which
>> aren't at commit message at all, including introducing the distinction
>> I've just noted of *p_old being NULL meaning a write rather than cmpxchg.
> I can split out the type change, but you realize this means touching
> twice some of the exact same code? As to changes not mentioned
> in the commit message - I'm having trouble spotting those (the type
> change is mentioned).

What you don't mention is that you're changing how
ptwr_emulated_update() evaluates what to do, and this took me a while to
figure out.
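
To spell it out for the record, this is the convention as I now read it --
a standalone sketch with made-up names and return codes, not the actual
ptwr code:

#include <stddef.h>
#include <stdint.h>

enum rc { RC_OKAY, RC_CMPXCHG_FAILED };

/* NULL p_old requests a plain write; non-NULL requests a compare-and-
 * exchange, with the value actually found reported back on a failure. */
static enum rc update_entry(uint64_t *entry, uint64_t *p_old, uint64_t val)
{
    if ( p_old == NULL )          /* write */
    {
        *entry = val;
        return RC_OKAY;
    }

    if ( *entry != *p_old )       /* cmpxchg: comparison failed */
    {
        *p_old = *entry;          /* caller can retry with this value */
        return RC_CMPXCHG_FAILED;
    }

    *entry = val;                 /* cmpxchg: comparison succeeded */
    return RC_OKAY;
}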

>
>> On that note specifically, it would be clearer to have "const bool
>> do_cmpxchg = *p_old; /* cmpxchg, or write? */".  If you don't want to do
>> it, then there needs to be a comment with the function explaining the
>> semantics of p_old.
> I'll add a comment, even if I have a hard time finding a good place
> to put it. Ahead of the function isn't really a good place imo, but I
> also can't figure anything better.

With that sorted, Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v3 11/25] x86emul: place test blobs in executable section
  2018-02-05 13:11       ` Andrew Cooper
@ 2018-02-05 13:55         ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-05 13:55 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 05.02.18 at 14:11, <andrew.cooper3@citrix.com> wrote:
> On 02/02/18 15:27, Jan Beulich wrote:
>>>>> On 02.02.18 at 14:03, <andrew.cooper3@citrix.com> wrote:
>>> On 07/12/17 14:06, Jan Beulich wrote:
>>>> This allows the section contents to be disassembled without going
>>>> through any extra hoops, simplifying the analysis of problems in test
>>>> and/or emulation code.
>>>>
>>>> The blobs being emitted as (r/o) data means we need to accept an
>>>> assembler warning here (about the differing section attributes).
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> What about just giving up their constness?  This is a test program after
>>> all.
>> Then the conflict would be in two attributes (writable and
>> executable) rather than just one. The issue is that we emit them
>> as data, but want them to be in an executable section. If anything
>> we'd have to re-do how they're emitted (e.g. by using asm()), but
>> that seems overkill to me.
> 
> Ok.  Instead, how about having a second .test.const? wouldn't that
> resolve the warnings, but still leave the instructions in an executable
> section?

Well, with just the above I can't see how you'd expect that to work:
Whatever we name the section that the const arrays are emitted to,
it'll have a section directive with just "a" as attributes. Somehow we'd
need to add the missing "x", and that would require a second section
directive. Which would trigger the very warning again.

Now we could of course have a second source file, but besides this
imo going too far, it would also have the downside that in the .o
we'd then again have a non-executable section containing code
(and which most tools won't easily disassemble).
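
To illustrate the clash in isolation (made-up section and symbol names;
this only shows the general assembler behaviour, not the harness code):

/* blob.c -- a const array placed in a section we also want executable. */
#include <stdint.h>

/* The compiler emits '.section .test.blob,"a",@progbits' for this ... */
const uint8_t blob[] __attribute__((section(".test.blob"))) = {
    0x0f, 0x0b,                   /* ud2 */
};

/* ... while this re-declares the same section with "ax"; whichever
 * directive gas sees second triggers the changed-attributes warning. */
asm ( ".section .test.blob, \"ax\", @progbits\n\t.previous" );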

> Either way, this is just a developer utility, so Acked-by: Andrew Cooper
> <andrew.cooper3@citrix.com>

Thanks, Jan



* Re: [PATCH v3 21/25] x86emul: add read-modify-write hook
  2018-02-05  8:22     ` Jan Beulich
@ 2018-02-05 14:21       ` Andrew Cooper
  2018-02-05 14:56         ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 14:21 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel

On 05/02/18 08:22, Jan Beulich wrote:
>>>> On 02.02.18 at 17:13, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:16, Jan Beulich wrote:
>>> In order to correctly emulate read-modify-write insns, especially
>>> LOCKed ones, we should not issue reads and writes separately. Use a
>>> new hook to combine both, and don't uniformly read the memory
>>> destination anymore. Instead, DstMem opcodes without Mov now need to
>>> have done so in their respective case blocks.
>>>
>>> Also strip bogus _ prefixes from macro parameters when this only affects
>>> lines which are being changed anyway.
>>>
>>> In the test harness, besides some re-ordering to facilitate running a
>>> few tests twice (one without and a second time with the .rmw hook in
>>> place), tighten a few EFLAGS checks and add a test for NOT with memory
>>> operand (in particular to verify EFLAGS don't get altered there).
>>>
>>> For now make use of the hook optional for callers; eventually we may
>>> want to consider making this mandatory.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> v3: New.
>>> ---
>>> TBD: Do we want to also support non-lockable RMW insns in the new hook
>>>      and helper (SHL & friends, SHLD, SHRD)?
>> What would this achieve?  I suppose it would avoid a double pagewalk.
> Well - it's mostly the implication from this double walk which I'm
> concerned about: The first walk is one for a read, which might
> succeed when the second (write) walk fails. We'd then have done
> a read which should have never occurred. But anyway this would
> be follow-up work only, nothing to be added to the patch here.

In some copious new future with the emulation changes discussed at
summit, we'd want to issue a single writeable translation request.

On that basis then, we should extend the use of this hook to all RMW
instructions, irrespective of locking.
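
A toy model of the difference, to make the double-walk point explicit
(nothing like the real emulator interface; just the shape of the problem,
with made-up callback signatures and bytes <= 8 assumed):

#include <stdint.h>

struct ops {
    int (*read)(unsigned long addr, void *val, unsigned int bytes);
    int (*write)(unsigned long addr, const void *val, unsigned int bytes);
    /* Optional: map the destination once (write access) and apply the
     * operation in place. */
    int (*rmw)(unsigned long addr, unsigned int bytes, uint64_t op_arg);
};

static int emulate_rmw_insn(const struct ops *ops, unsigned long addr,
                            unsigned int bytes, uint64_t op_arg)
{
    uint64_t val = 0;
    int rc;

    if ( ops->rmw )
        return ops->rmw(addr, bytes, op_arg); /* one (write) translation */

    rc = ops->read(addr, &val, bytes);        /* first walk: read ... */
    if ( rc )
        return rc;
    val ^= op_arg;                            /* ... modify (e.g. XOR) ... */
    return ops->write(addr, &val, bytes);     /* ... second walk: write */
}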

>
>>> -    case 0x38 ... 0x3d: cmp: /* cmp */
>>> +    case 0x38: case 0x39: cmp: /* cmp reg,mem */
>>> +        if ( ops->rmw && dst.type == OP_MEM &&
>>> +             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
>>> +                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
>> Why does rmw matter here? cmp doesn't write to its operands.
> The read of the "destination" operand was skipped in case there
> is a ->rmw hook (see the change to generic destination operand
> processing ahead of the main switch()). This needs to be carried
> out here now (and elsewhere when what is nominally the
> destination operand really is a second source one).

Oh, right.  I think this needs to be clearer.  Having two different
behaviours depending on whether the caller provides an rmw hook is very
subtle.

The particularly confusing thing here is that cmp isn't an rmw
instruction.  I presume the oddity comes about because cmp encodes the
memory operand in dst, even though the operand doesn't get written to?

~Andrew


* Re: [PATCH v3 21/25] x86emul: add read-modify-write hook
  2018-02-05 14:21       ` Andrew Cooper
@ 2018-02-05 14:56         ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-05 14:56 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 05.02.18 at 15:21, <andrew.cooper3@citrix.com> wrote:
> On 05/02/18 08:22, Jan Beulich wrote:
>>>>> On 02.02.18 at 17:13, <andrew.cooper3@citrix.com> wrote:
>>> On 07/12/17 14:16, Jan Beulich wrote:
>>>> In order to correctly emulate read-modify-write insns, especially
>>>> LOCKed ones, we should not issue reads and writes separately. Use a
>>>> new hook to combine both, and don't uniformly read the memory
>>>> destination anymore. Instead, DstMem opcodes without Mov now need to
>>>> have done so in their respective case blocks.
>>>>
>>>> Also strip bogus _ prefixes from macro parameters when this only affects
>>>> lines which are being changed anyway.
>>>>
>>>> In the test harness, besides some re-ordering to facilitate running a
>>>> few tests twice (one without and a second time with the .rmw hook in
>>>> place), tighten a few EFLAGS checks and add a test for NOT with memory
>>>> operand (in particular to verify EFLAGS don't get altered there).
>>>>
>>>> For now make use of the hook optional for callers; eventually we may
>>>> want to consider making this mandatory.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> v3: New.
>>>> ---
>>>> TBD: Do we want to also support non-lockable RMW insns in the new hook
>>>>      and helper (SHL & friends, SHLD, SHRD)?
>>> What would this achieve?  I suppose it would avoid a double pagewalk.
>> Well - it's mostly the implication from this double walk which I'm
>> concerned about: The first walk is one for a read, which might
>> succeed when the second (write) walk fails. We'd then have done
>> a read which should have never occurred. But anyway this would
>> be follow-up work only, nothing to be added to the patch here.
> 
> In some copious new future with the emulation changes discussed at
> summit, we'd want to issue a single writeable translation request.
> 
> On that basis then, we should extend the use of this hook to all RMW
> instruction, irrespective of locking.

Added to my todo list. Before putting together the patch here I
had actually considered the introduction of map/unmap hooks,
but I did conclude that this would be quite a bit more intrusive.

>>>> -    case 0x38 ... 0x3d: cmp: /* cmp */
>>>> +    case 0x38: case 0x39: cmp: /* cmp reg,mem */
>>>> +        if ( ops->rmw && dst.type == OP_MEM &&
>>>> +             (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val,
>>>> +                              dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
>>> Why does rmw matter here? cmp doesn't write to its operands.
>> The read of the "destination" operand was skipped in case there
>> is a ->rmw hook (see the change to generic destination operand
>> processing ahead of the main switch()). This needs to be carried
>> out here now (and elsewhere when what is nominally the
>> destination operand really is a second source one).
> 
> Oh, right.  I think this needs to be clearer.  Having two different
> behaviours depending on whether the caller provides an rmw hook is very
> subtle.

Well, if you have suggestions as to how to make this more clear,
I'm all ears.

> The particularly confusing thing here is that cmp isn't an rmw
> instruction.  I presume the oddity comes about because cmp encodes the
> memory operand in dst, even though the operand doesn't get written to?

Sort of (formally source and destination aren't spelled out at
the ModR/M byte level): It's really our operand encoding scheme
which puts us into this situation: We can't encode more than one
source operand. And for the insn flavors with ModR/M and
immediate the table entries are even shared between CMP and
the 7 other opcodes all writing to the register or memory.
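
For reference, the sharing in question is the Grp1 block (opcodes
0x80-0x83), where ModRM.reg selects the operation and only /7 leaves the
r/m operand unwritten -- noted here merely as a reminder, not as emulator
code:

/* Grp1 (0x80-0x83): ModRM.reg acts as an opcode extension. */
static const char *const grp1_ops[8] = {
    "add", "or", "adc", "sbb", "and", "sub", "xor",
    "cmp",    /* /7 -- the only one not writing its "destination" */
};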

Jan



* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2018-02-05  8:32     ` Jan Beulich
@ 2018-02-05 16:09       ` Andrew Cooper
  2018-02-05 16:49         ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 16:09 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Paul Durrant

On 05/02/18 08:32, Jan Beulich wrote:
>>>> On 02.02.18 at 17:36, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:16, Jan Beulich wrote:
>>> --- a/xen/arch/x86/hvm/emulate.c
>>> +++ b/xen/arch/x86/hvm/emulate.c
>>> @@ -1296,8 +1296,83 @@ static int hvmemul_cmpxchg(
>>>      bool lock,
>>>      struct x86_emulate_ctxt *ctxt)
>>>  {
>>> -    /* Fix this in case the guest is really relying on r-m-w atomicity. */
>>> -    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
>>> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
>>> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>>> +    struct vcpu *curr = current;
>>> +    unsigned long addr, reps = 1;
>>> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;
>> I'm fairly certain from my pagetable work that passing PFEC_page_present
>> here is bogus, and I do have (eventual) plans to make the pagewalk
>> reject such values.
> Both here and in the subsequent RMW patch I'm simply following
> what hvmemul_write() does. I'd prefer all three to stay in sync.

Fair enough.

>
>>> +    case 16:
>>> +        if ( cpu_has_cx16 )
>>> +        {
>>> +            __uint128_t *old = p_old, cur;
>>> +
>>> +            if ( lock )
>>> +                cur = __cmpxchg16b(mapping, old, p_new);
>>> +            else
>>> +                cur = cmpxchg16b_local_(mapping, old, p_new);
>>> +            if ( cur != *old )
>>> +            {
>>> +                *old = cur;
>>> +                rc = X86EMUL_CMPXCHG_FAILED;
>>> +            }
>>> +            break;
>>> +        }
>>> +        /* fall through */
>>> +    default:
>> ASSERT_UNREACHABLE() ?
>>
>>> +        rc = X86EMUL_UNHANDLEABLE;
>>> +        break;
>>> +    }
> I'm not sure - from an abstract POV cpu_has_cx16 and the guest
> seeing the feature available in its CPUID policy could differ. Granted
> we're unlikely to want to try to emulate CMPXCHG16B without having
> the instruction available ourselves, but it still wouldn't seem entirely
> correct to assert here. I could remove the fall-through and _then_
> assert in the default case only. Let me know.

The point was to catch bad sizes from being passed in.  There is only a
single ancient range of 64bit processors which don't have CX16, but I'd
still argue that it would be a bug for the emulator to pass 16 down in
such cases.

>
>>> --- a/xen/include/asm-x86/system.h
>>> +++ b/xen/include/asm-x86/system.h
>>> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>>>      return old;
>>>  }
>>>  
>>> +static always_inline unsigned long cmpxchg_local_(
>> unlocked_cmpxchg() ?
> Not in line with our current naming scheme.

It's rather more in line than introducing a local_ suffix.  Trailing
underscores are almost non-existent, and as far as I can tell, used
exclusively in the internals of the compat code.

~Andrew


* Re: [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook
  2018-02-05  8:34     ` Jan Beulich
@ 2018-02-05 16:15       ` Andrew Cooper
  0 siblings, 0 replies; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 16:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Paul Durrant

On 05/02/18 08:34, Jan Beulich wrote:
>>>> On 02.02.18 at 17:37, <andrew.cooper3@citrix.com> wrote:
>> On 07/12/17 14:17, Jan Beulich wrote:
>>> ..., at least as far as currently possible, i.e. when a mapping can be
>>> obtained.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> v3: New.
>>>
>>> --- a/xen/arch/x86/hvm/emulate.c
>>> +++ b/xen/arch/x86/hvm/emulate.c
>>> @@ -1187,6 +1187,61 @@ static int hvmemul_write(
>>>      return X86EMUL_OKAY;
>>>  }
>>>  
>>> +static int hvmemul_rmw(
>>> +    enum x86_segment seg,
>>> +    unsigned long offset,
>>> +    unsigned int bytes,
>>> +    uint32_t *eflags,
>>> +    struct x86_emulate_state *state,
>>> +    struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +    struct hvm_emulate_ctxt *hvmemul_ctxt =
>>> +        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>>> +    unsigned long addr, reps = 1;
>>> +    uint32_t pfec = PFEC_page_present | PFEC_write_access;
>> Drop present, and...
> See reply to previous patch.
>
>>> +    struct hvm_vcpu_io *vio = &current->arch.hvm_vcpu.hvm_io;
>>> +    int rc;
>>> +    void *mapping;
>>> +
>>> +    rc = hvmemul_virtual_to_linear(
>>> +        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
>>> +    if ( rc != X86EMUL_OKAY || !bytes )
>>> +        return rc;
>>> +
>>> +    if ( is_x86_system_segment(seg) )
>>> +        pfec |= PFEC_implicit;
>>> +    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>>> +        pfec |= PFEC_user_mode;
>>> +
>>> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
>>> +    if ( IS_ERR(mapping) )
>>> +        return ~PTR_ERR(mapping);
>>> +
>>> +    if ( mapping )
>>> +    {
>>> +        rc = x86_emul_rmw(mapping, bytes, eflags, state, ctxt);
>>> +        hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
>>> +    }
>>> +    else
>>> +    {
>>> +        unsigned long data = 0;
>>> +        bool_t known_gpfn = vio->mmio_access.write_access &&
>>> +                            vio->mmio_gla == (addr & PAGE_MASK);
>> ... bool here.
> Oops.
>
>> Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Thanks, but due to the first of the two requests you'll need to let
> me know whether this applies with just the second change done.

I'll accept the consistency argument and let the first one go.

~Andrew


* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2018-02-05 16:09       ` Andrew Cooper
@ 2018-02-05 16:49         ` Jan Beulich
  2018-02-05 16:57           ` Andrew Cooper
  0 siblings, 1 reply; 85+ messages in thread
From: Jan Beulich @ 2018-02-05 16:49 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Paul Durrant

>>> On 05.02.18 at 17:09, <andrew.cooper3@citrix.com> wrote:
> On 05/02/18 08:32, Jan Beulich wrote:
>>>>> On 02.02.18 at 17:36, <andrew.cooper3@citrix.com> wrote:
>>> On 07/12/17 14:16, Jan Beulich wrote:
>>>> +    case 16:
>>>> +        if ( cpu_has_cx16 )
>>>> +        {
>>>> +            __uint128_t *old = p_old, cur;
>>>> +
>>>> +            if ( lock )
>>>> +                cur = __cmpxchg16b(mapping, old, p_new);
>>>> +            else
>>>> +                cur = cmpxchg16b_local_(mapping, old, p_new);
>>>> +            if ( cur != *old )
>>>> +            {
>>>> +                *old = cur;
>>>> +                rc = X86EMUL_CMPXCHG_FAILED;
>>>> +            }
>>>> +            break;
>>>> +        }
>>>> +        /* fall through */
>>>> +    default:
>>> ASSERT_UNREACHABLE() ?
>>>
>>>> +        rc = X86EMUL_UNHANDLEABLE;
>>>> +        break;
>>>> +    }
>> I'm not sure - from an abstract POV cpu_has_cx16 and the guest
>> seeing the feature available in its CPUID policy could differ. Granted
>> we're unlikely to want to try to emulate CMPXCHG16B without having
>> the instruction available ourselves, but it still wouldn't seem entirely
>> correct to assert here. I could remove the fall-through and _then_
>> assert in the default case only. Let me know.
> 
> The point was to catch bad sizes from being passed in.  There is only a
> single ancient range of 64bit processors which don't have CX16, but I'd
> still argue that it would be a bug for the emulator to pass 16 down in
> such cases.

So - are you fine then with my earlier suggestion towards an actual
change to make here?

>>>> --- a/xen/include/asm-x86/system.h
>>>> +++ b/xen/include/asm-x86/system.h
>>>> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>>>>      return old;
>>>>  }
>>>>  
>>>> +static always_inline unsigned long cmpxchg_local_(
>>> unlocked_cmpxchg() ?
>> Not in line with our current naming scheme.
> 
> Its rather more in line than introducing a local_ suffix.  Trailing
> underscores are almost non-existant, and as far as I can tell, used
> exclusively in the internals of the compat code.

Well, the name choice started from Linux'es cmpxchg_local(), of
which the function introduced here would be a helper. I'd like to
stick to the Linux inherited naming scheme (read: I want to keep
the "cmpxchg_local" part), but I don't insist on the trailing
underscore (which I only use here [and elsewhere] in preference
of name space violating leading ones). I'd just need a suggestion
towards an alternative you could live with, and fitting the outlined
criteria.

Jan



* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2018-02-05 16:49         ` Jan Beulich
@ 2018-02-05 16:57           ` Andrew Cooper
  2018-02-05 17:05             ` Jan Beulich
  0 siblings, 1 reply; 85+ messages in thread
From: Andrew Cooper @ 2018-02-05 16:57 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Paul Durrant

On 05/02/18 16:49, Jan Beulich wrote:
>>>> On 05.02.18 at 17:09, <andrew.cooper3@citrix.com> wrote:
>> On 05/02/18 08:32, Jan Beulich wrote:
>>>>>> On 02.02.18 at 17:36, <andrew.cooper3@citrix.com> wrote:
>>>> On 07/12/17 14:16, Jan Beulich wrote:
>>>>> +    case 16:
>>>>> +        if ( cpu_has_cx16 )
>>>>> +        {
>>>>> +            __uint128_t *old = p_old, cur;
>>>>> +
>>>>> +            if ( lock )
>>>>> +                cur = __cmpxchg16b(mapping, old, p_new);
>>>>> +            else
>>>>> +                cur = cmpxchg16b_local_(mapping, old, p_new);
>>>>> +            if ( cur != *old )
>>>>> +            {
>>>>> +                *old = cur;
>>>>> +                rc = X86EMUL_CMPXCHG_FAILED;
>>>>> +            }
>>>>> +            break;
>>>>> +        }
>>>>> +        /* fall through */
>>>>> +    default:
>>>> ASSERT_UNREACHABLE() ?
>>>>
>>>>> +        rc = X86EMUL_UNHANDLEABLE;
>>>>> +        break;
>>>>> +    }
>>> I'm not sure - from an abstract POV cpu_has_cx16 and the guest
>>> seeing the feature available in its CPUID policy could differ. Granted
>>> we're unlikely to want to try to emulate CMPXCHG16B without having
>>> the instruction available ourselves, but it still wouldn't seem entirely
>>> correct to assert here. I could remove the fall-through and _then_
>>> assert in the default case only. Let me know.
>> The point was to catch bad sizes from being passed in.  There is only a
>> single ancient range of 64bit processors which don't have CX16, but I'd
>> still argue that it would be a bug for the emulator to pass 16 down in
>> such cases.
> So - are you fine then with my earlier suggestion towards an actual
> change to make here?

Ok.

>
>>>>> --- a/xen/include/asm-x86/system.h
>>>>> +++ b/xen/include/asm-x86/system.h
>>>>> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>>>>>      return old;
>>>>>  }
>>>>>  
>>>>> +static always_inline unsigned long cmpxchg_local_(
>>>> unlocked_cmpxchg() ?
>>> Not in line with our current naming scheme.
>> It's rather more in line than introducing a local_ suffix.  Trailing
>> underscores are almost non-existent, and as far as I can tell, used
>> exclusively in the internals of the compat code.
> Well, the name choice started from Linux'es cmpxchg_local(), of
> which the function introduced here would be a helper. I'd like to
> stick to the Linux inherited naming scheme (read: I want to keep
> the "cmpxchg_local" part), but I don't insist on the trailing
> underscore (which I only use here [and elsewhere] in preference
> of name space violating leading ones). I'd just need a suggestion
> towards an alternative you could live with, and fitting the outlined
> criteria.

cmpxchg_local() would be better than with a trailing underscore.

Seeing as it matches the Linux naming scheme, using exactly
cmpxchg_local() would be the logical move.

~Andrew


* Re: [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg()
  2018-02-05 16:57           ` Andrew Cooper
@ 2018-02-05 17:05             ` Jan Beulich
  0 siblings, 0 replies; 85+ messages in thread
From: Jan Beulich @ 2018-02-05 17:05 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Paul Durrant

>>> On 05.02.18 at 17:57, <andrew.cooper3@citrix.com> wrote:
> On 05/02/18 16:49, Jan Beulich wrote:
>>>>> On 05.02.18 at 17:09, <andrew.cooper3@citrix.com> wrote:
>>> On 05/02/18 08:32, Jan Beulich wrote:
>>>>>>> On 02.02.18 at 17:36, <andrew.cooper3@citrix.com> wrote:
>>>>>> --- a/xen/include/asm-x86/system.h
>>>>>> +++ b/xen/include/asm-x86/system.h
>>>>>> @@ -110,6 +110,38 @@ static always_inline unsigned long __cmp
>>>>>>      return old;
>>>>>>  }
>>>>>>  
>>>>>> +static always_inline unsigned long cmpxchg_local_(
>>>>> unlocked_cmpxchg() ?
>>>> Not in line with our current naming scheme.
>>> It's rather more in line than introducing a local_ suffix.  Trailing
>>> underscores are almost non-existent, and as far as I can tell, used
>>> exclusively in the internals of the compat code.
>> Well, the name choice started from Linux'es cmpxchg_local(), of
>> which the function introduced here would be a helper. I'd like to
>> stick to the Linux inherited naming scheme (read: I want to keep
>> the "cmpxchg_local" part), but I don't insist on the trailing
>> underscore (which I only use here [and elsewhere] in preference
>> of name space violating leading ones). I'd just need a suggestion
>> towards an alternative you could live with, and fitting the outlined
>> criteria.
> 
> cmpxchg_local() would be better than with a trailing underscore.
> 
> Seeing as it matches the Linux naming scheme, using exactly
> cmpxchg_local() would be the logical move.

Note how I've said "of which the function introduced here would be
a helper": cmpxchg_local() should have no size parameter.

Jan




Thread overview: 85+ messages
2017-12-07 13:49 [PATCH v3 00/25] x86: emulator enhancements Jan Beulich
2017-12-07 13:58 ` [PATCH v3 01/25] x86emul: make decode_register() return unsigned long * Jan Beulich
2017-12-07 18:32   ` Andrew Cooper
2017-12-08  7:44     ` Jan Beulich
2017-12-07 13:59 ` [PATCH v3 02/25] x86emul: build SIMD tests with -Os Jan Beulich
2017-12-07 18:32   ` Andrew Cooper
2017-12-07 14:00 ` [PATCH v3 03/25] x86emul: support F16C insns Jan Beulich
2018-01-31 18:58   ` Andrew Cooper
2017-12-07 14:01 ` [PATCH v3 04/25] x86emul: support FMA4 insns Jan Beulich
2018-01-31 19:51   ` Andrew Cooper
2017-12-07 14:02 ` [PATCH v3 05/25] x86emul: support FMA insns Jan Beulich
2018-02-01 16:15   ` Andrew Cooper
2017-12-07 14:03 ` [PATCH v3 06/25] x86emul: support most remaining AVX2 insns Jan Beulich
2018-02-01 19:45   ` Andrew Cooper
2018-02-02  9:29     ` Jan Beulich
2017-12-07 14:03 ` [PATCH v3 07/25] x86emul: support AVX2 gather insns Jan Beulich
2018-02-01 20:53   ` Andrew Cooper
2018-02-02  9:44     ` Jan Beulich
2017-12-07 14:04 ` [PATCH v3 08/25] x86emul: add tables for XOP 08 and 09 extension spaces Jan Beulich
2018-02-02 11:43   ` Andrew Cooper
2018-02-02 15:15     ` Jan Beulich
2018-02-02 16:02       ` Andrew Cooper
2017-12-07 14:04 ` [PATCH v3 09/25] x86emul: support XOP insns Jan Beulich
2018-02-02 12:03   ` Andrew Cooper
2018-02-02 15:17     ` Jan Beulich
2018-02-05 13:01       ` Andrew Cooper
2017-12-07 14:05 ` [PATCH v3 10/25] x86emul: support 3DNow! insns Jan Beulich
2018-02-02 13:02   ` Andrew Cooper
2018-02-02 15:22     ` Jan Beulich
2018-02-02 16:04       ` Andrew Cooper
2017-12-07 14:06 ` [PATCH v3 11/25] x86emul: place test blobs in executable section Jan Beulich
2018-02-02 13:03   ` Andrew Cooper
2018-02-02 15:27     ` Jan Beulich
2018-02-05 13:11       ` Andrew Cooper
2018-02-05 13:55         ` Jan Beulich
2017-12-07 14:07 ` [PATCH v3 12/25] x86emul: abstract out XCRn accesses Jan Beulich
2018-02-02 13:29   ` Andrew Cooper
2018-02-02 17:05     ` Jan Beulich
2017-12-07 14:08 ` [PATCH v3 13/25] x86emul: adjust_bnd() should check XCR0 Jan Beulich
2018-02-02 13:30   ` Andrew Cooper
2018-02-02 16:19     ` Jan Beulich
2018-02-02 16:28       ` Andrew Cooper
2017-12-07 14:09 ` [PATCH v3 14/25] x86emul: make all FPU emulation use the stub Jan Beulich
2018-02-02 13:37   ` Andrew Cooper
2017-12-07 14:10 ` [PATCH v3 15/25] x86/HVM: eliminate custom #MF/#XM handling Jan Beulich
2018-02-02 13:38   ` Andrew Cooper
2017-12-07 14:11 ` [PATCH v3 16/25] x86emul: support SWAPGS Jan Beulich
2018-02-02 13:41   ` Andrew Cooper
2018-02-02 16:24     ` Jan Beulich
2017-12-07 14:11 ` [PATCH v3 17/25] x86emul: emulate {MONITOR, MWAIT}{, X} as no-op Jan Beulich
2018-02-02 14:05   ` Andrew Cooper
2017-12-07 14:12 ` [PATCH v3 18/25] x86emul: add missing suffixes in test harness Jan Beulich
2018-02-02 14:13   ` Andrew Cooper
2017-12-07 14:14 ` [PATCH v3 19/25] x86emul: tell cmpxchg hook whether LOCK is in effect Jan Beulich
2017-12-08 10:58   ` Paul Durrant
2018-02-02 14:13   ` Andrew Cooper
2017-12-07 14:15 ` [PATCH v3 20/25] x86emul: correctly handle CMPXCHG* comparison failures Jan Beulich
2018-02-02 14:49   ` Andrew Cooper
2018-02-05  8:07     ` Jan Beulich
2018-02-05 13:38       ` Andrew Cooper
2017-12-07 14:16 ` [PATCH v3 21/25] x86emul: add read-modify-write hook Jan Beulich
2018-02-02 16:13   ` Andrew Cooper
2018-02-05  8:22     ` Jan Beulich
2018-02-05 14:21       ` Andrew Cooper
2018-02-05 14:56         ` Jan Beulich
2017-12-07 14:16 ` [PATCH v3 22/25] x86/HVM: do actual CMPXCHG in hvmemul_cmpxchg() Jan Beulich
2017-12-07 14:38   ` Razvan Cojocaru
2017-12-08 10:38   ` Paul Durrant
2018-02-02 16:36   ` Andrew Cooper
2018-02-05  8:32     ` Jan Beulich
2018-02-05 16:09       ` Andrew Cooper
2018-02-05 16:49         ` Jan Beulich
2018-02-05 16:57           ` Andrew Cooper
2018-02-05 17:05             ` Jan Beulich
2017-12-07 14:17 ` [PATCH v3 23/25] x86/HVM: make use of new read-modify-write emulator hook Jan Beulich
2017-12-08 10:41   ` Paul Durrant
2018-02-02 16:37   ` Andrew Cooper
2018-02-05  8:34     ` Jan Beulich
2018-02-05 16:15       ` Andrew Cooper
2017-12-07 14:18 ` [PATCH v3 24/25] x86/shadow: fully move unmap-dest into common code Jan Beulich
2018-02-02 16:46   ` Andrew Cooper
2017-12-07 14:19 ` [PATCH v3 25/25] x86/shadow: fold sh_x86_emulate_{write, cmpxchg}() into their only callers Jan Beulich
2018-02-02 16:52   ` Andrew Cooper
2018-02-05  8:42     ` Jan Beulich
2018-02-05 12:16       ` Tim Deegan
